* [PATCH 00/17] x86: split insn emulator decode and execution
@ 2016-09-08 12:58 Jan Beulich
  2016-09-08 13:04 ` [PATCH 01/17] x86emul: split instruction decoding from execution Jan Beulich
                   ` (17 more replies)
  0 siblings, 18 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 12:58 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

..., complete the decoder, leverage decoding for SVM instruction
sizing and PV 32-bit call gate emulation, and use the emulator for
PV priv-op handling.

01: x86emul: split instruction decoding from execution
02: x86emul: fetch all insn bytes during the decode phase
03: x86emul: track only rIP in emulator state
04: x86emul: complete decoding of two-byte instructions
05: x86emul: add XOP decoding
06: x86emul: add EVEX decoding
07: x86emul: move x86_execute() common epilogue code
08: x86emul: generate and make use of canonical opcode representation
09: SVM: use generic instruction decoding
10: x86/32on64: use generic instruction decoding
11: x86/PV: split out dealing with CRn from privileged instruction handling
12: x86/PV: split out dealing with DRn from privileged instruction handling
13: x86/PV: split out dealing with MSRs from privileged instruction handling
14: x86emul: support XSETBV
15: x86emul: sort opcode 0f01 special case switch() statement
16: x86/PV: use generic emulator for privileged instruction handling
17: x86emul: don't assume a memory operand

Signed-off-by: Jan Beulich <jbeulich@suse.com>



* [PATCH 01/17] x86emul: split instruction decoding from execution
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
@ 2016-09-08 13:04 ` Jan Beulich
  2016-09-09 18:35   ` Andrew Cooper
  2016-09-08 13:07 ` [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase Jan Beulich
                   ` (16 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:04 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


This is only the mechanical part; a subsequent patch will make
non-mechanical adjustments to actually do all decoding in this new
function.
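
(For illustration only - not part of the patch: a self-contained toy
example of the decode/execute split this series moves towards. Decode
reads instruction bytes and fills a state structure; execute then acts
purely on that state, fetching nothing further.)

    #include <stdint.h>
    #include <stdio.h>

    /* Decode-time state: everything execution needs, gathered up front. */
    struct toy_state {
        uint8_t opcode;
        int32_t imm;          /* immediate fetched during decode */
        unsigned int len;     /* total bytes consumed */
    };

    /* Phase 1: read bytes and fill the state; nothing is fetched later. */
    static int toy_decode(const uint8_t *insn, struct toy_state *s)
    {
        s->opcode = insn[0];
        s->len = 1;
        s->imm = 0;
        if ( s->opcode == 0x05 )          /* add eax, imm32 */
        {
            s->imm = (int32_t)((uint32_t)insn[1] |
                               ((uint32_t)insn[2] << 8) |
                               ((uint32_t)insn[3] << 16) |
                               ((uint32_t)insn[4] << 24));
            s->len += 4;
        }
        return 0;
    }

    /* Phase 2: act on the decoded state only. */
    static void toy_execute(const struct toy_state *s, int32_t *eax)
    {
        if ( s->opcode == 0x05 )
            *eax += s->imm;
    }

    int main(void)
    {
        const uint8_t insn[] = { 0x05, 0x01, 0x00, 0x00, 0x00 }; /* add eax,1 */
        struct toy_state s;
        int32_t eax = 41;

        toy_decode(insn, &s);
        toy_execute(&s, &eax);
        printf("len=%u eax=%d\n", s.len, eax);   /* prints "len=5 eax=42" */
        return 0;
    }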

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -48,7 +48,9 @@
 /* All operands are implicit in the opcode. */
 #define ImplicitOps (DstImplicit|SrcImplicit)
 
-static uint8_t opcode_table[256] = {
+typedef uint8_t opcode_desc_t;
+
+static const opcode_desc_t opcode_table[256] = {
     /* 0x00 - 0x07 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
@@ -178,7 +180,7 @@ static uint8_t opcode_table[256] = {
     ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
 };
 
-static uint8_t twobyte_table[256] = {
+static const opcode_desc_t twobyte_table[256] = {
     /* 0x00 - 0x07 */
     SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
     /* 0x08 - 0x0F */
@@ -607,7 +609,7 @@ do{ asm volatile (
 })
 #define truncate_ea(ea) truncate_word((ea), ad_bytes)
 
-#define mode_64bit() (def_ad_bytes == 8)
+#define mode_64bit() (ctxt->addr_size == 64)
 
 #define fail_if(p)                                      \
 do {                                                    \
@@ -1558,32 +1560,63 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
-int
-x86_emulate(
-    struct x86_emulate_ctxt *ctxt,
-    const struct x86_emulate_ops  *ops)
-{
-    /* Shadow copy of register state. Committed on successful emulation. */
-    struct cpu_user_regs _regs = *ctxt->regs;
+struct x86_emulate_state {
+    unsigned int op_bytes, ad_bytes;
+
+    enum { ext_none, ext_0f, ext_0f38 } ext;
+    uint8_t opcode;
+    uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+    uint8_t rex_prefix;
+    bool lock_prefix;
+    opcode_desc_t desc;
+    union vex vex;
+    int override_seg;
 
-    uint8_t b, d, sib, sib_index, sib_base, rex_prefix = 0;
-    uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
-    enum { ext_none, ext_0f, ext_0f38 } ext = ext_none;
-    union vex vex = {};
-    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
-    bool_t lock_prefix = 0;
-    int override_seg = -1, rc = X86EMUL_OKAY;
-    struct operand src = { .reg = REG_POISON };
-    struct operand dst = { .reg = REG_POISON };
-    enum x86_swint_type swint_type;
-    struct x86_emulate_stub stub = {};
-    DECLARE_ALIGNED(mmval_t, mmval);
     /*
      * Data operand effective address (usually computed from ModRM).
      * Default is a memory operand relative to segment DS.
      */
-    struct operand ea = { .type = OP_MEM, .reg = REG_POISON };
-    ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */
+    struct operand ea;
+
+    /* Immediate operand values, if any. Use otherwise unused fields. */
+#define imm1 ea.val
+#define imm2 ea.orig_val
+
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs regs;
+};
+
+/* Helper definitions. */
+#define op_bytes (state->op_bytes)
+#define ad_bytes (state->ad_bytes)
+#define ext (state->ext)
+#define modrm (state->modrm)
+#define modrm_mod (state->modrm_mod)
+#define modrm_reg (state->modrm_reg)
+#define modrm_rm (state->modrm_rm)
+#define rex_prefix (state->rex_prefix)
+#define lock_prefix (state->lock_prefix)
+#define vex (state->vex)
+#define override_seg (state->override_seg)
+#define ea (state->ea)
+#define _regs (state->regs)
+
+static int
+x86_decode(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops  *ops)
+{
+    uint8_t b, d, sib, sib_index, sib_base;
+    unsigned int def_op_bytes, def_ad_bytes;
+    int rc = X86EMUL_OKAY;
+
+    memset(state, 0, sizeof(*state));
+    override_seg = -1;
+    ea.type = OP_MEM;
+    ea.mem.seg = x86_seg_ds;
+    ea.reg = REG_POISON;
+    _regs = *ctxt->regs;
 
     ctxt->retire.byte = 0;
 
@@ -1800,7 +1833,7 @@ x86_emulate(
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
             default: /* Until it is worth making this table based ... */
-                goto cannot_emulate;
+                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -1932,6 +1965,61 @@ x86_emulate(
     if ( override_seg != -1 && ea.type == OP_MEM )
         ea.mem.seg = override_seg;
 
+    /* Fetch the immediate operand, if present. */
+    switch ( d & SrcMask )
+    {
+        unsigned int bytes;
+
+    case SrcImm:
+        if ( !(d & ByteOp) )
+            bytes = op_bytes != 8 ? op_bytes : 4;
+        else
+        {
+    case SrcImmByte:
+            bytes = 1;
+        }
+        /* NB. Immediates are sign-extended as necessary. */
+        switch ( bytes )
+        {
+        case 1: imm1 = insn_fetch_type(int8_t);  break;
+        case 2: imm1 = insn_fetch_type(int16_t); break;
+        case 4: imm1 = insn_fetch_type(int32_t); break;
+        }
+        break;
+    case SrcImm16:
+        imm1 = insn_fetch_type(uint16_t);
+        break;
+    }
+
+    state->opcode = b;
+    state->desc = d;
+
+ done:
+    return rc;
+}
+
+int
+x86_emulate(
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    struct x86_emulate_state state;
+    int rc;
+    uint8_t b, d;
+    struct operand src = { .reg = REG_POISON };
+    struct operand dst = { .reg = REG_POISON };
+    enum x86_swint_type swint_type;
+    struct x86_emulate_stub stub = {};
+    DECLARE_ALIGNED(mmval_t, mmval);
+
+    rc = x86_decode(&state, ctxt, ops);
+    if ( rc != X86EMUL_OKAY)
+        return rc;
+
+    b = state.opcode;
+    d = state.desc;
+#define state (&state)
+
     /* Decode and fetch the source operand: register, memory or immediate. */
     switch ( d & SrcMask )
     {
@@ -1987,18 +2075,12 @@ x86_emulate(
             src.bytes = 1;
         }
         src.type  = OP_IMM;
-        /* NB. Immediates are sign-extended as necessary. */
-        switch ( src.bytes )
-        {
-        case 1: src.val = insn_fetch_type(int8_t);  break;
-        case 2: src.val = insn_fetch_type(int16_t); break;
-        case 4: src.val = insn_fetch_type(int32_t); break;
-        }
+        src.val   = imm1;
         break;
     case SrcImm16:
         src.type  = OP_IMM;
         src.bytes = 2;
-        src.val   = insn_fetch_type(uint16_t);
+        src.val   = imm1;
         break;
     }
 
@@ -3892,8 +3974,8 @@ x86_emulate(
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
-    /* Zero the upper 32 bits of %rip if not in long mode. */
-    if ( def_ad_bytes < sizeof(_regs.eip) )
+    /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
+    if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
     *ctxt->regs = _regs;
@@ -4876,4 +4958,19 @@ x86_emulate(
     _put_fpu();
     put_stub(stub);
     return X86EMUL_UNHANDLEABLE;
+#undef state
 }
+
+#undef op_bytes
+#undef ad_bytes
+#undef ext
+#undef modrm
+#undef modrm_mod
+#undef modrm_reg
+#undef modrm_rm
+#undef rex_prefix
+#undef lock_prefix
+#undef vex
+#undef override_seg
+#undef ea
+#undef _regs




* [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
  2016-09-08 13:04 ` [PATCH 01/17] x86emul: split instruction decoding from execution Jan Beulich
@ 2016-09-08 13:07 ` Jan Beulich
  2016-09-13 18:44   ` Andrew Cooper
  2016-09-08 13:08 ` [PATCH 04/17] x86emul: track only rIP in emulator state Jan Beulich
                   ` (15 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


This way we can offer callers the service of just sizing
instructions, and we can also better guarantee not to raise the wrong
fault due to not having read all relevant bytes.
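
(Hypothetical illustration, not added by this patch: once every byte is
fetched during decode, an instruction's length is simply the rIP
advance accumulated in the decode state, e.g.)

    /* Hypothetical sketch, assuming the state layout at this point in
     * the series: state->regs is the shadow register copy whose eip was
     * advanced past the last byte fetched by x86_decode(). */
    static unsigned int insn_length(const struct x86_emulate_state *state,
                                    const struct x86_emulate_ctxt *ctxt)
    {
        return state->regs.eip - ctxt->regs->eip;
    }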

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -129,8 +129,8 @@ static const opcode_desc_t opcode_table[
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, ImplicitOps,
     /* 0xA0 - 0xA7 */
-    ByteOp|DstEax|SrcImplicit|Mov, DstEax|SrcImplicit|Mov,
-    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+    ByteOp|DstEax|SrcMem|Mov, DstEax|SrcMem|Mov,
+    ByteOp|DstMem|SrcEax|Mov, DstMem|SrcEax|Mov,
     ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
     ByteOp|ImplicitOps, ImplicitOps,
     /* 0xA8 - 0xAF */
@@ -1602,6 +1602,45 @@ struct x86_emulate_state {
 #define _regs (state->regs)
 
 static int
+x86_decode_base(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_OKAY;
+
+    switch ( state->opcode )
+    {
+    case 0x9a: /* call (far, absolute) */
+    case 0xea: /* jmp (far, absolute) */
+        generate_exception_if(mode_64bit(), EXC_UD, -1);
+
+        imm1 = insn_fetch_bytes(op_bytes);
+        imm2 = insn_fetch_type(uint16_t);
+        break;
+
+    case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+    case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
+        /* Source EA is not encoded via ModRM. */
+        ea.mem.off = insn_fetch_bytes(ad_bytes);
+        break;
+
+    case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
+        if ( op_bytes == 8 ) /* Fetch more bytes to obtain imm64. */
+            imm1 = ((uint32_t)imm1 |
+                    ((uint64_t)insn_fetch_type(uint32_t) << 32));
+        break;
+
+    case 0xc8: /* enter imm16,imm8 */
+        imm2 = insn_fetch_type(uint8_t);
+        break;
+    }
+
+ done:
+    return rc;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -1994,10 +2033,29 @@ x86_decode(
     state->opcode = b;
     state->desc = d;
 
+    switch ( ext )
+    {
+    case ext_none:
+        rc = x86_decode_base(state, ctxt, ops);
+        break;
+
+    case ext_0f:
+    case ext_0f38:
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
  done:
     return rc;
 }
 
+/* No insn fetching past this point. */
+#undef insn_fetch_bytes
+#undef insn_fetch_type
+
 int
 x86_emulate(
     struct x86_emulate_ctxt *ctxt,
@@ -2560,6 +2618,8 @@ x86_emulate(
     case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
         generate_exception_if((modrm_reg & 7) != 0, EXC_UD, -1);
     case 0x88 ... 0x8b: /* mov */
+    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         dst.val = src.val;
         break;
 
@@ -2644,18 +2704,13 @@ x86_emulate(
 
     case 0x9a: /* call (far, absolute) */ {
         struct segment_register reg;
-        uint16_t sel;
-        uint32_t eip;
 
-        generate_exception_if(mode_64bit(), EXC_UD, -1);
+        ASSERT(!mode_64bit());
         fail_if(ops->read_segment == NULL);
 
-        eip = insn_fetch_bytes(op_bytes);
-        sel = insn_fetch_type(uint16_t);
-
         if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
-             (rc = load_seg(x86_seg_cs, sel, 0, &cs, ctxt, ops)) ||
-             (validate_far_branch(&cs, eip),
+             (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
+             (validate_far_branch(&cs, imm1),
               rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
                               &reg.sel, op_bytes, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
@@ -2663,7 +2718,7 @@ x86_emulate(
              (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) )
             goto done;
 
-        _regs.eip = eip;
+        _regs.eip = imm1;
         break;
     }
 
@@ -2706,23 +2761,6 @@ x86_emulate(
         ((uint8_t *)&_regs.eax)[1] = (_regs.eflags & 0xd7) | 0x02;
         break;
 
-    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
-        /* Source EA is not encoded via ModRM. */
-        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
-                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
-            goto done;
-        break;
-
-    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
-        /* Destination EA is not encoded via ModRM. */
-        dst.type  = OP_MEM;
-        dst.mem.seg = ea.mem.seg;
-        dst.mem.off = insn_fetch_bytes(ad_bytes);
-        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        dst.val   = (unsigned long)_regs.eax;
-        break;
-
     case 0xa4 ... 0xa5: /* movs */ {
         unsigned long nr_reps = get_rep_prefix();
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
@@ -2840,9 +2878,6 @@ x86_emulate(
         break;
 
     case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
-        if ( dst.bytes == 8 ) /* Fetch more bytes to obtain imm64 */
-            src.val = ((uint32_t)src.val |
-                       ((uint64_t)insn_fetch_type(uint32_t) << 32));
         dst.reg = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
         dst.val = src.val;
@@ -2906,7 +2941,7 @@ x86_emulate(
         goto les;
 
     case 0xc8: /* enter imm16,imm8 */ {
-        uint8_t depth = insn_fetch_type(uint8_t) & 31;
+        uint8_t depth = imm2 & 31;
         int i;
 
         dst.type = OP_REG;
@@ -3627,17 +3662,12 @@ x86_emulate(
         jmp_rel((int32_t)src.val);
         break;
 
-    case 0xea: /* jmp (far, absolute) */ {
-        uint16_t sel;
-        uint32_t eip;
-        generate_exception_if(mode_64bit(), EXC_UD, -1);
-        eip = insn_fetch_bytes(op_bytes);
-        sel = insn_fetch_type(uint16_t);
-        if ( (rc = load_seg(x86_seg_cs, sel, 0, &cs, ctxt, ops)) ||
-             (rc = commit_far_branch(&cs, eip)) )
+    case 0xea: /* jmp (far, absolute) */
+        ASSERT(!mode_64bit());
+        if ( (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
+             (rc = commit_far_branch(&cs, imm1)) )
             goto done;
         break;
-    }
 
     case 0xf1: /* int1 (icebp) */
         src.val = EXC_DB;




* [PATCH 04/17] x86emul: track only rIP in emulator state
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
  2016-09-08 13:04 ` [PATCH 01/17] x86emul: split instruction decoding from execution Jan Beulich
  2016-09-08 13:07 ` [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase Jan Beulich
@ 2016-09-08 13:08 ` Jan Beulich
  2016-09-08 13:23   ` Jan Beulich
  2016-09-08 13:09 ` [PATCH 03/17] " Jan Beulich
                   ` (14 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:08 UTC (permalink / raw)
  To: xen-devel, Jan Beulich; +Cc: Andrew Cooper


Now that all decoding happens in x86_decode() there's no need to keep
the local registers copy in struct x86_emulate_state. Only rIP gets
updated in the decode phase, so only that register needs tracking
there. All other (read-only) registers can be read from the original
structure (but sadly, due to it getting passed to decode_register(),
the pointer can't be made to point to "const" to make the compiler help
ensure no modification happens).
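
(For clarity, the "re-vector" step added near the top of x86_emulate()
in the hunk below amounts to rebasing the pointer by its byte offset
within the register structure; shown here with extra comments:)

    /* ea.reg was set up by x86_decode() to point into *ctxt->regs (via
     * state->regs).  Execution must operate on the local shadow copy
     * _regs, so translate the pointer by its byte offset within the
     * structure; the result is only committed on successful emulation. */
    if ( ea.type == OP_REG )
    {
        unsigned int offs = (void *)ea.reg - (void *)state->regs;

        ea.reg = (void *)&_regs + offs;
    }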

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -590,9 +590,9 @@ do{ asm volatile (
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch_bytes(_size)                                         \
-({ unsigned long _x = 0, _eip = _regs.eip;                              \
-   _regs.eip += (_size); /* real hardware doesn't truncate */           \
-   generate_exception_if((uint8_t)(_regs.eip -                          \
+({ unsigned long _x = 0, _eip = state->eip;                             \
+   state->eip += (_size); /* real hardware doesn't truncate */          \
+   generate_exception_if((uint8_t)(state->eip -                         \
                                    ctxt->regs->eip) > MAX_INST_LEN,     \
                          EXC_GP, 0);                                    \
    rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt);          \
@@ -1582,8 +1582,8 @@ struct x86_emulate_state {
 #define imm1 ea.val
 #define imm2 ea.orig_val
 
-    /* Shadow copy of register state. Committed on successful emulation. */
-    struct cpu_user_regs regs;
+    unsigned long eip;
+    struct cpu_user_regs *regs;
 };
 
 /* Helper definitions. */
@@ -1599,7 +1599,6 @@ struct x86_emulate_state {
 #define vex (state->vex)
 #define override_seg (state->override_seg)
 #define ea (state->ea)
-#define _regs (state->regs)
 
 static int
 x86_decode_base(
@@ -1655,7 +1654,8 @@ x86_decode(
     ea.type = OP_MEM;
     ea.mem.seg = x86_seg_ds;
     ea.reg = REG_POISON;
-    _regs = *ctxt->regs;
+    state->regs = ctxt->regs;
+    state->eip = ctxt->regs->eip;
 
     ctxt->retire.byte = 0;
 
@@ -1759,7 +1759,7 @@ x86_decode(
             default:
                 BUG();
             case 2:
-                if ( in_realmode(ctxt, ops) || (_regs.eflags & EFLG_VM) )
+                if ( in_realmode(ctxt, ops) || (state->regs->eflags & EFLG_VM) )
                     break;
                 /* fall through */
             case 4:
@@ -1885,7 +1885,7 @@ x86_decode(
             modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
             ea.reg  = decode_register(
-                modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0));
+                modrm_rm, state->regs, (d & ByteOp) && (rex_prefix == 0));
         }
         else if ( ad_bytes == 2 )
         {
@@ -1893,33 +1893,33 @@ x86_decode(
             switch ( modrm_rm )
             {
             case 0:
-                ea.mem.off = _regs.ebx + _regs.esi;
+                ea.mem.off = state->regs->ebx + state->regs->esi;
                 break;
             case 1:
-                ea.mem.off = _regs.ebx + _regs.edi;
+                ea.mem.off = state->regs->ebx + state->regs->edi;
                 break;
             case 2:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.esi;
+                ea.mem.off = state->regs->ebp + state->regs->esi;
                 break;
             case 3:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.edi;
+                ea.mem.off = state->regs->ebp + state->regs->edi;
                 break;
             case 4:
-                ea.mem.off = _regs.esi;
+                ea.mem.off = state->regs->esi;
                 break;
             case 5:
-                ea.mem.off = _regs.edi;
+                ea.mem.off = state->regs->edi;
                 break;
             case 6:
                 if ( modrm_mod == 0 )
                     break;
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp;
+                ea.mem.off = state->regs->ebp;
                 break;
             case 7:
-                ea.mem.off = _regs.ebx;
+                ea.mem.off = state->regs->ebx;
                 break;
             }
             switch ( modrm_mod )
@@ -1946,14 +1946,15 @@ x86_decode(
                 sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
                 sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
                 if ( sib_index != 4 )
-                    ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0);
+                    ea.mem.off = *(long *)decode_register(sib_index,
+                                                          state->regs, 0);
                 ea.mem.off <<= (sib >> 6) & 3;
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.esp;
+                    ea.mem.off += state->regs->esp;
                     if ( !ext && (b == 0x8f) )
                         /* POP <rm> computes its EA post increment. */
                         ea.mem.off += ((mode_64bit() && (op_bytes == 4))
@@ -1962,15 +1963,17 @@ x86_decode(
                 else if ( sib_base == 5 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.ebp;
+                    ea.mem.off += state->regs->ebp;
                 }
                 else
-                    ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0);
+                    ea.mem.off += *(long *)decode_register(sib_base,
+                                                           state->regs, 0);
             }
             else
             {
                 modrm_rm |= (rex_prefix & 1) << 3;
-                ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0);
+                ea.mem.off = *(long *)decode_register(modrm_rm,
+                                                      state->regs, 0);
                 if ( (modrm_rm == 5) && (modrm_mod != 0) )
                     ea.mem.seg = x86_seg_ss;
             }
@@ -1983,7 +1986,7 @@ x86_decode(
                 if ( !mode_64bit() )
                     break;
                 /* Relative to RIP of next instruction. Argh! */
-                ea.mem.off += _regs.eip;
+                ea.mem.off += state->eip;
                 if ( (d & SrcMask) == SrcImm )
                     ea.mem.off += (d & ByteOp) ? 1 :
                         ((op_bytes == 8) ? 4 : op_bytes);
@@ -2061,6 +2064,8 @@ x86_emulate(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs _regs = *ctxt->regs;
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
@@ -2074,10 +2079,21 @@ x86_emulate(
     if ( rc != X86EMUL_OKAY)
         return rc;
 
+    /* Sync rIP to post decode value. */
+    _regs.eip = state.eip;
+
     b = state.opcode;
     d = state.desc;
 #define state (&state)
 
+    /* Re-vector ea's register pointer into our shadow registers. */
+    if ( ea.type == OP_REG )
+    {
+        unsigned int offs = (void *)ea.reg - (void *)state->regs;
+
+        ea.reg = (void *)&_regs + offs;
+    }
+
     /* Decode and fetch the source operand: register, memory or immediate. */
     switch ( d & SrcMask )
     {
@@ -5003,4 +5019,3 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
-#undef _regs




* [PATCH 03/17] x86emul: track only rIP in emulator state
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (2 preceding siblings ...)
  2016-09-08 13:08 ` [PATCH 04/17] x86emul: track only rIP in emulator state Jan Beulich
@ 2016-09-08 13:09 ` Jan Beulich
  2016-09-13 19:09   ` Andrew Cooper
  2016-09-08 13:10 ` [PATCH 04/17] x86emul: complete decoding of two-byte instructions Jan Beulich
                   ` (13 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:09 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


Now that all decoding happens in x86_decode() there's no need to keep
the local registers copy in struct x86_emulate_state. Only rIP gets
updated in the decode phase, so only that register needs tracking
there. All other (read-only) registers can be read from the original
structure (but sadly, due to it getting passed to decode_register(),
the pointer can't be made to point to "const" to make the compiler help
ensure no modification happens).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -590,9 +590,9 @@ do{ asm volatile (
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch_bytes(_size)                                         \
-({ unsigned long _x = 0, _eip = _regs.eip;                              \
-   _regs.eip += (_size); /* real hardware doesn't truncate */           \
-   generate_exception_if((uint8_t)(_regs.eip -                          \
+({ unsigned long _x = 0, _eip = state->eip;                             \
+   state->eip += (_size); /* real hardware doesn't truncate */          \
+   generate_exception_if((uint8_t)(state->eip -                         \
                                    ctxt->regs->eip) > MAX_INST_LEN,     \
                          EXC_GP, 0);                                    \
    rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt);          \
@@ -1582,8 +1582,8 @@ struct x86_emulate_state {
 #define imm1 ea.val
 #define imm2 ea.orig_val
 
-    /* Shadow copy of register state. Committed on successful emulation. */
-    struct cpu_user_regs regs;
+    unsigned long eip;
+    struct cpu_user_regs *regs;
 };
 
 /* Helper definitions. */
@@ -1599,7 +1599,6 @@ struct x86_emulate_state {
 #define vex (state->vex)
 #define override_seg (state->override_seg)
 #define ea (state->ea)
-#define _regs (state->regs)
 
 static int
 x86_decode_base(
@@ -1655,7 +1654,8 @@ x86_decode(
     ea.type = OP_MEM;
     ea.mem.seg = x86_seg_ds;
     ea.reg = REG_POISON;
-    _regs = *ctxt->regs;
+    state->regs = ctxt->regs;
+    state->eip = ctxt->regs->eip;
 
     ctxt->retire.byte = 0;
 
@@ -1759,7 +1759,7 @@ x86_decode(
             default:
                 BUG();
             case 2:
-                if ( in_realmode(ctxt, ops) || (_regs.eflags & EFLG_VM) )
+                if ( in_realmode(ctxt, ops) || (state->regs->eflags & EFLG_VM) )
                     break;
                 /* fall through */
             case 4:
@@ -1885,7 +1885,7 @@ x86_decode(
             modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
             ea.reg  = decode_register(
-                modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0));
+                modrm_rm, state->regs, (d & ByteOp) && (rex_prefix == 0));
         }
         else if ( ad_bytes == 2 )
         {
@@ -1893,33 +1893,33 @@ x86_decode(
             switch ( modrm_rm )
             {
             case 0:
-                ea.mem.off = _regs.ebx + _regs.esi;
+                ea.mem.off = state->regs->ebx + state->regs->esi;
                 break;
             case 1:
-                ea.mem.off = _regs.ebx + _regs.edi;
+                ea.mem.off = state->regs->ebx + state->regs->edi;
                 break;
             case 2:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.esi;
+                ea.mem.off = state->regs->ebp + state->regs->esi;
                 break;
             case 3:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.edi;
+                ea.mem.off = state->regs->ebp + state->regs->edi;
                 break;
             case 4:
-                ea.mem.off = _regs.esi;
+                ea.mem.off = state->regs->esi;
                 break;
             case 5:
-                ea.mem.off = _regs.edi;
+                ea.mem.off = state->regs->edi;
                 break;
             case 6:
                 if ( modrm_mod == 0 )
                     break;
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp;
+                ea.mem.off = state->regs->ebp;
                 break;
             case 7:
-                ea.mem.off = _regs.ebx;
+                ea.mem.off = state->regs->ebx;
                 break;
             }
             switch ( modrm_mod )
@@ -1946,14 +1946,15 @@ x86_decode(
                 sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
                 sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
                 if ( sib_index != 4 )
-                    ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0);
+                    ea.mem.off = *(long *)decode_register(sib_index,
+                                                          state->regs, 0);
                 ea.mem.off <<= (sib >> 6) & 3;
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.esp;
+                    ea.mem.off += state->regs->esp;
                     if ( !ext && (b == 0x8f) )
                         /* POP <rm> computes its EA post increment. */
                         ea.mem.off += ((mode_64bit() && (op_bytes == 4))
@@ -1962,15 +1963,17 @@ x86_decode(
                 else if ( sib_base == 5 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.ebp;
+                    ea.mem.off += state->regs->ebp;
                 }
                 else
-                    ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0);
+                    ea.mem.off += *(long *)decode_register(sib_base,
+                                                           state->regs, 0);
             }
             else
             {
                 modrm_rm |= (rex_prefix & 1) << 3;
-                ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0);
+                ea.mem.off = *(long *)decode_register(modrm_rm,
+                                                      state->regs, 0);
                 if ( (modrm_rm == 5) && (modrm_mod != 0) )
                     ea.mem.seg = x86_seg_ss;
             }
@@ -1983,7 +1986,7 @@ x86_decode(
                 if ( !mode_64bit() )
                     break;
                 /* Relative to RIP of next instruction. Argh! */
-                ea.mem.off += _regs.eip;
+                ea.mem.off += state->eip;
                 if ( (d & SrcMask) == SrcImm )
                     ea.mem.off += (d & ByteOp) ? 1 :
                         ((op_bytes == 8) ? 4 : op_bytes);
@@ -2061,6 +2064,8 @@ x86_emulate(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs _regs = *ctxt->regs;
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
@@ -2074,10 +2079,21 @@ x86_emulate(
     if ( rc != X86EMUL_OKAY)
         return rc;
 
+    /* Sync rIP to post decode value. */
+    _regs.eip = state.eip;
+
     b = state.opcode;
     d = state.desc;
 #define state (&state)
 
+    /* Re-vector ea's register pointer into our shadow registers. */
+    if ( ea.type == OP_REG )
+    {
+        unsigned int offs = (void *)ea.reg - (void *)state->regs;
+
+        ea.reg = (void *)&_regs + offs;
+    }
+
     /* Decode and fetch the source operand: register, memory or immediate. */
     switch ( d & SrcMask )
     {
@@ -5003,4 +5019,3 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
-#undef _regs



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (3 preceding siblings ...)
  2016-09-08 13:09 ` [PATCH 03/17] " Jan Beulich
@ 2016-09-08 13:10 ` Jan Beulich
  2016-09-14 14:22   ` Andrew Cooper
  2016-09-08 13:11 ` [PATCH 05/17] x86emul: add XOP decoding Jan Beulich
                   ` (12 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:10 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 9496 bytes --]

This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.

This at once adds correct raising of #UD for the three "ud<n>" flavors
(Intel names only "ud2", but AMD names all three of them in their
opcode maps), as that may make a difference to callers compared to
getting back X86EMUL_UNHANDLEABLE.

Note on opcodes 0FA6 and 0FA7: These are VIA's PadLock instructions,
which have a ModRM-like byte where only register forms are valid. I.e.
we could also use SrcImmByte there, but ModRM is more likely to be
correct for a hypothetical extension allowing non-register operations.

Note on opcode 0FB8: I think we're safe to ignore JMPE (which doesn't
take a ModRM byte, but an immediate).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -182,11 +182,14 @@ static const opcode_desc_t opcode_table[
 
 static const opcode_desc_t twobyte_table[256] = {
     /* 0x00 - 0x07 */
-    SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
+    SrcMem16|ModRM, ImplicitOps|ModRM, ModRM, ModRM,
+    0, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0x08 - 0x0F */
-    ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
+    ImplicitOps, ImplicitOps, 0, ImplicitOps,
+    0, ImplicitOps|ModRM, ImplicitOps, ModRM|SrcImmByte,
     /* 0x10 - 0x17 */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x18 - 0x1F */
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
@@ -194,12 +197,13 @@ static const opcode_desc_t twobyte_table
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     0, 0, 0, 0,
     /* 0x28 - 0x2F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, 0, ImplicitOps|ModRM, 0, 0, 0, 0,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x30 - 0x37 */
-    ImplicitOps, ImplicitOps, ImplicitOps, 0,
-    ImplicitOps, ImplicitOps, 0, 0,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, 0, ImplicitOps,
     /* 0x38 - 0x3F */
-    DstReg|SrcMem|ModRM, 0, 0, 0, 0, 0, 0, 0,
+    DstReg|SrcMem|ModRM, 0, DstReg|SrcImmByte|ModRM, 0, 0, 0, 0, 0,
     /* 0x40 - 0x47 */
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
@@ -211,11 +215,15 @@ static const opcode_desc_t twobyte_table
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     /* 0x50 - 0x5F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0x60 - 0x6F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
     /* 0x70 - 0x7F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
+    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
+    ModRM, ModRM, ModRM, ImplicitOps,
+    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
     /* 0x80 - 0x87 */
     DstImplicit|SrcImm, DstImplicit|SrcImm,
     DstImplicit|SrcImm, DstImplicit|SrcImm,
@@ -238,9 +246,9 @@ static const opcode_desc_t twobyte_table
     ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
     /* 0xA0 - 0xA7 */
     ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, 0, 0,
+    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, ModRM, ModRM,
     /* 0xA8 - 0xAF */
-    ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
+    ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
     DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
     ImplicitOps|ModRM, DstReg|SrcMem|ModRM,
     /* 0xB0 - 0xB7 */
@@ -249,22 +257,26 @@ static const opcode_desc_t twobyte_table
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
     /* 0xB8 - 0xBF */
-    0, 0, DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
+    DstReg|SrcMem|ModRM, ModRM,
+    DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
     DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
     /* 0xC0 - 0xC7 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    0, DstMem|SrcReg|ModRM|Mov,
-    0, 0, 0, ImplicitOps|ModRM,
+    SrcImmByte|ModRM, DstMem|SrcReg|ModRM|Mov,
+    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, ImplicitOps|ModRM,
     /* 0xC8 - 0xCF */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xDF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0xE0 - 0xEF */
-    0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0xF0 - 0xFF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
 };
 
 #define REX_PREFIX 0x40
@@ -1563,7 +1575,12 @@ int x86emul_unhandleable_rw(
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
-    enum { ext_none, ext_0f, ext_0f38 } ext;
+    enum {
+        ext_none = vex_none,
+        ext_0f   = vex_0f,
+        ext_0f38 = vex_0f38,
+        ext_0f3a = vex_0f3a,
+    } ext;
     uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
@@ -1640,6 +1657,34 @@ x86_decode_base(
 }
 
 static int
+x86_decode_twobyte(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_OKAY;
+
+    switch ( state->opcode )
+    {
+    case 0x78:
+        if ( vex.opcx )
+            break;
+        switch ( vex.pfx )
+        {
+        case vex_66: /* extrq $imm8, $imm8, xmm */
+        case vex_f2: /* insertq $imm8, $imm8, xmm, xmm */
+            imm1 = insn_fetch_type(uint8_t);
+            imm2 = insn_fetch_type(uint8_t);
+            break;
+        }
+        break;
+    }
+
+ done:
+    return rc;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -1743,6 +1788,10 @@ x86_decode(
                 b = insn_fetch_type(uint8_t);
                 ext = ext_0f38;
                 break;
+            case 0x3a:
+                b = insn_fetch_type(uint8_t);
+                ext = ext_0f3a;
+                break;
             }
         }
     }
@@ -1798,10 +1847,22 @@ x86_decode(
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
-                fail_if(vex.opcx != vex_0f);
-                ext = ext_0f;
                 b = insn_fetch_type(uint8_t);
-                d = twobyte_table[b];
+                switch ( ext = vex.opcx )
+                {
+                case vex_0f:
+                    d = twobyte_table[b];
+                    break;
+                case vex_0f38:
+                    d = twobyte_table[0x38];
+                    break;
+                case vex_0f3a:
+                    d = twobyte_table[0x3a];
+                    break;
+                default:
+                    rc = X86EMUL_UNHANDLEABLE;
+                    goto done;
+                }
 
                 modrm = insn_fetch_type(uint8_t);
                 modrm_mod = (modrm & 0xc0) >> 6;
@@ -1859,9 +1920,12 @@ x86_decode(
             break;
 
         case ext_0f:
+        case ext_0f3a:
             break;
 
         case ext_0f38:
+            if ( vex.opcx )
+                break;
             switch ( b )
             {
             case 0xf0: /* movbe / crc32 */
@@ -2043,7 +2107,11 @@ x86_decode(
         break;
 
     case ext_0f:
+        rc = x86_decode_twobyte(state, ctxt, ops);
+        break;
+
     case ext_0f38:
+    case ext_0f3a:
         break;
 
     default:
@@ -2263,6 +2331,7 @@ x86_emulate(
         goto ext_0f38_insn;
     default:
         ASSERT_UNREACHABLE();
+    case ext_0f3a:
         goto cannot_emulate;
     }
 
@@ -4264,6 +4333,11 @@ x86_emulate(
             goto done;
         break;
 
+    case 0x0b: /* ud2 */
+    case 0xb9: /* ud1 */
+    case 0xff: /* ud0 */
+        generate_exception_if(1, EXC_UD, -1);
+
     case 0x0d: /* GrpP (prefetch) */
     case 0x18: /* Grp16 (prefetch/nop) */
     case 0x19 ... 0x1f: /* nop (amd-defined) */
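
For illustration of the 0FA6/0FA7 note above: a ModRM-like byte splits into
mod/reg/rm fields, and the register-only forms are exactly the mod == 3
encodings, so sizing these opcodes via the normal ModRM path works. A minimal
standalone sketch (the example byte value is arbitrary):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint8_t modrm = 0xc8; /* example byte following 0F A7 */
    uint8_t mod = (modrm & 0xc0) >> 6; /* 3: register form, no memory operand */
    uint8_t reg = (modrm & 0x38) >> 3; /* 1: opcode extension / register */
    uint8_t rm  = modrm & 0x07;        /* 0: second register */

    assert(mod == 3 && reg == 1 && rm == 0);
    return 0;
}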



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 05/17] x86emul: add XOP decoding
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (4 preceding siblings ...)
  2016-09-08 13:10 ` [PATCH 04/17] x86emul: complete decoding of two-byte instructions Jan Beulich
@ 2016-09-08 13:11 ` Jan Beulich
  2016-09-14 16:11   ` Andrew Cooper
  2016-09-08 13:12 ` [PATCH 06/17] x86emul: add EVEX decoding Jan Beulich
                   ` (11 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:11 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 3837 bytes --]

This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -279,6 +279,12 @@ static const opcode_desc_t twobyte_table
     ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
 };
 
+static const opcode_desc_t xop_table[] = {
+    DstReg|SrcImmByte|ModRM,
+    DstReg|SrcMem|ModRM,
+    DstReg|SrcImm|ModRM,
+};
+
 #define REX_PREFIX 0x40
 #define REX_B 0x01
 #define REX_X 0x02
@@ -1580,6 +1586,9 @@ struct x86_emulate_state {
         ext_0f   = vex_0f,
         ext_0f38 = vex_0f38,
         ext_0f3a = vex_0f3a,
+        ext_8f08 = 8,
+        ext_8f09,
+        ext_8f0a,
     } ext;
     uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -1802,7 +1811,7 @@ x86_decode(
         modrm = insn_fetch_type(uint8_t);
         modrm_mod = (modrm & 0xc0) >> 6;
 
-        if ( !ext && ((b & ~1) == 0xc4) )
+        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
             switch ( def_ad_bytes )
             {
             default:
@@ -1816,11 +1825,11 @@ x86_decode(
                     break;
                 /* fall through */
             case 8:
-                /* VEX */
+                /* VEX / XOP */
                 generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
 
                 vex.raw[0] = modrm;
-                if ( b & 1 )
+                if ( b == 0xc5 )
                 {
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
@@ -1848,18 +1857,30 @@ x86_decode(
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
-                switch ( ext = vex.opcx )
+                ext = vex.opcx;
+                if ( b != 0x8f )
+                {
+                    switch ( ext )
+                    {
+                    case vex_0f:
+                        d = twobyte_table[b];
+                        break;
+                    case vex_0f38:
+                        d = twobyte_table[0x38];
+                        break;
+                    case vex_0f3a:
+                        d = twobyte_table[0x3a];
+                        break;
+                    default:
+                        rc = X86EMUL_UNHANDLEABLE;
+                        goto done;
+                    }
+                }
+                else if ( ext < ext_8f08 +
+                                sizeof(xop_table) / sizeof(*xop_table) )
+                    d = xop_table[ext - ext_8f08];
+                else
                 {
-                case vex_0f:
-                    d = twobyte_table[b];
-                    break;
-                case vex_0f38:
-                    d = twobyte_table[0x38];
-                    break;
-                case vex_0f3a:
-                    d = twobyte_table[0x3a];
-                    break;
-                default:
                     rc = X86EMUL_UNHANDLEABLE;
                     goto done;
                 }
@@ -1921,6 +1942,9 @@ x86_decode(
 
         case ext_0f:
         case ext_0f3a:
+        case ext_8f08:
+        case ext_8f09:
+        case ext_8f0a:
             break;
 
         case ext_0f38:
@@ -2112,6 +2136,9 @@ x86_decode(
 
     case ext_0f38:
     case ext_0f3a:
+    case ext_8f08:
+    case ext_8f09:
+    case ext_8f0a:
         break;
 
     default:
@@ -2332,6 +2359,9 @@ x86_emulate(
     default:
         ASSERT_UNREACHABLE();
     case ext_0f3a:
+    case ext_8f08:
+    case ext_8f09:
+    case ext_8f0a:
         goto cannot_emulate;
     }
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 06/17] x86emul: add EVEX decoding
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (5 preceding siblings ...)
  2016-09-08 13:11 ` [PATCH 05/17] x86emul: add XOP decoding Jan Beulich
@ 2016-09-08 13:12 ` Jan Beulich
  2016-09-14 17:05   ` Andrew Cooper
  2016-09-08 13:13 ` [PATCH 07/17] x86emul: move x86_execute() common epilogue code Jan Beulich
                   ` (10 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 2957 bytes --]

This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: I'm undecided whether to propagate evex.R into modrm_reg right away
     (and then also deal with the new meaning of evex.x for
     modrm_rm). Since that doesn't affect GPRs (and the extra bits
     would need masking off when accessing GPRs) I've left this out for
     now.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -336,6 +336,27 @@ union vex {
         ptr[1] = rex | REX_PREFIX; \
 } while (0)
 
+union evex {
+    uint8_t raw[3];
+    struct {
+        uint8_t opcx:2;
+        uint8_t :2;
+        uint8_t R:1;
+        uint8_t b:1;
+        uint8_t x:1;
+        uint8_t r:1;
+        uint8_t pfx:2;
+        uint8_t evex:1;
+        uint8_t reg:4;
+        uint8_t w:1;
+        uint8_t opmsk:3;
+        uint8_t RX:1;
+        uint8_t bcst:1;
+        uint8_t lr:2;
+        uint8_t z:1;
+    };
+};
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -1596,6 +1617,7 @@ struct x86_emulate_state {
     bool lock_prefix;
     opcode_desc_t desc;
     union vex vex;
+    union evex evex;
     int override_seg;
 
     /*
@@ -1623,6 +1645,7 @@ struct x86_emulate_state {
 #define rex_prefix (state->rex_prefix)
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
+#define evex (state->evex)
 #define override_seg (state->override_seg)
 #define ea (state->ea)
 
@@ -1811,7 +1834,8 @@ x86_decode(
         modrm = insn_fetch_type(uint8_t);
         modrm_mod = (modrm & 0xc0) >> 6;
 
-        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
+        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18)) ||
+                      b == 0x62) )
             switch ( def_ad_bytes )
             {
             default:
@@ -1825,7 +1849,7 @@ x86_decode(
                     break;
                 /* fall through */
             case 8:
-                /* VEX / XOP */
+                /* VEX / XOP / EVEX */
                 generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
 
                 vex.raw[0] = modrm;
@@ -1852,6 +1876,14 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
+                    if ( b == 0x62 )
+                    {
+                        evex.raw[0] = vex.raw[0];
+                        evex.raw[1] = vex.raw[1];
+                        evex.raw[2] = insn_fetch_type(uint8_t);
+
+                        vex.opcx = evex.opcx;
+                    }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
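
As a cross-check of the union evex bitfield layout, the same fields can be
extracted with explicit shifts and masks. The three payload bytes below are
merely an example, corresponding to a 512-bit, 0F-map encoding with no SIMD
prefix and no opmask:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Payload bytes following the 0x62 escape; values are for illustration. */
    uint8_t p0 = 0xf1, p1 = 0x7c, p2 = 0x48;

    assert((p0 & 0x03) == 1);         /* opcx: 0f opcode map */
    assert(((p0 >> 4) & 0x1) == 1);   /* R (inverted) */
    assert(((p0 >> 7) & 0x1) == 1);   /* r (inverted) */

    assert((p1 & 0x03) == 0);         /* pfx: no 66/F3/F2 */
    assert(((p1 >> 2) & 0x1) == 1);   /* fixed '1' bit */
    assert(((p1 >> 3) & 0xf) == 0xf); /* reg (inverted vvvv): unused */
    assert(((p1 >> 7) & 0x1) == 0);   /* w */

    assert((p2 & 0x07) == 0);         /* opmsk: no mask register */
    assert(((p2 >> 5) & 0x3) == 2);   /* lr: 512-bit vector length */
    assert(((p2 >> 7) & 0x1) == 0);   /* z: merging rather than zeroing */
    return 0;
}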




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 07/17] x86emul: move x86_execute() common epilogue code
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (6 preceding siblings ...)
  2016-09-08 13:12 ` [PATCH 06/17] x86emul: add EVEX decoding Jan Beulich
@ 2016-09-08 13:13 ` Jan Beulich
  2016-09-08 13:28   ` Jan Beulich
  2016-09-14 17:13   ` Andrew Cooper
  2016-09-08 13:14 ` [PATCH 08/17] x86emul: generate and make use of canonical opcode representation Jan Beulich
                   ` (9 subsequent siblings)
  17 siblings, 2 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 3783 bytes --]

Only code movement, no functional change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
This is just to ease review of a later patch.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4111,56 +4111,7 @@ x86_emulate(
     default:
         goto cannot_emulate;
     }
-
- writeback:
-    switch ( dst.type )
-    {
-    case OP_REG:
-        /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-        switch ( dst.bytes )
-        {
-        case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
-        case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
-        case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
-        case 8: *dst.reg = dst.val; break;
-        }
-        break;
-    case OP_MEM:
-        if ( !(d & Mov) && (dst.orig_val == dst.val) &&
-             !ctxt->force_writeback )
-            /* nothing to do */;
-        else if ( lock_prefix )
-            rc = ops->cmpxchg(
-                dst.mem.seg, dst.mem.off, &dst.orig_val,
-                &dst.val, dst.bytes, ctxt);
-        else
-            rc = ops->write(
-                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
-        if ( rc != 0 )
-            goto done;
-    default:
-        break;
-    }
-
- no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
-    /* Commit shadow register state. */
-    _regs.eflags &= ~EFLG_RF;
-
-    /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
-    if ( !mode_64bit() )
-        _regs.eip = (uint32_t)_regs.eip;
-
-    *ctxt->regs = _regs;
-
- done:
-    _put_fpu();
-    put_stub(stub);
-    return rc;
+    goto writeback;
 
  ext_0f_insn:
     switch ( b )
@@ -5134,7 +5085,56 @@ x86_emulate(
     default:
         goto cannot_emulate;
     }
-    goto writeback;
+
+ writeback:
+    switch ( dst.type )
+    {
+    case OP_REG:
+        /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+        switch ( dst.bytes )
+        {
+        case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
+        case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
+        case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
+        case 8: *dst.reg = dst.val; break;
+        }
+        break;
+    case OP_MEM:
+        if ( !(d & Mov) && (dst.orig_val == dst.val) &&
+             !ctxt->force_writeback )
+            /* nothing to do */;
+        else if ( lock_prefix )
+            rc = ops->cmpxchg(
+                dst.mem.seg, dst.mem.off, &dst.orig_val,
+                &dst.val, dst.bytes, ctxt);
+        else
+            rc = ops->write(
+                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+        if ( rc != 0 )
+            goto done;
+    default:
+        break;
+    }
+
+ no_writeback:
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
+         (ops->inject_hw_exception != NULL) )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
+
+    /* Commit shadow register state. */
+    _regs.eflags &= ~EFLG_RF;
+
+    /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
+    if ( !mode_64bit() )
+        _regs.eip = (uint32_t)_regs.eip;
+
+    *ctxt->regs = _regs;
+
+ done:
+    _put_fpu();
+    put_stub(stub);
+    return rc;
 
  cannot_emulate:
     _put_fpu();
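
As an aside on the moved block: the "4-byte case *is* correct" comment reflects
the architectural rule that a 32-bit register write zero-extends into the upper
half, which is why storing (uint32_t)dst.val through the full-width pointer is
sufficient. A trivial standalone check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t reg = ~0ULL;       /* stands in for a stale 64-bit GPR value */
    uint64_t val = 0x12345678;  /* 32-bit result to be written back */

    reg = (uint32_t)val;        /* mirrors the "case 4" writeback above */

    assert(reg == 0x12345678ULL);
    return 0;
}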




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 08/17] x86emul: generate and make use of canonical opcode representation
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (7 preceding siblings ...)
  2016-09-08 13:13 ` [PATCH 07/17] x86emul: move x86_execute() common epilogue code Jan Beulich
@ 2016-09-08 13:14 ` Jan Beulich
  2016-09-14 17:30   ` Andrew Cooper
  2016-09-08 13:14 ` [PATCH 09/17] SVM: use generic instruction decoding Jan Beulich
                   ` (8 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 33541 bytes --]

This representation is then made available to interested callers,
to facilitate replacing their custom decoding.

This entails combining the three main switch statements into one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
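
For reference, MASK_EXTR() / MASK_INSR() (added to the test harness below and
used to compose the canonical opcode value) just move a field to or from the
position given by a mask's lowest set bit. A small standalone sketch with an
arbitrary example mask:

#include <assert.h>

#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))

int main(void)
{
    /* 0x0f00 is an arbitrary example mask covering bits 11:8. */
    assert(MASK_INSR(0x3, 0x0f00) == 0x0300);
    assert(MASK_EXTR(0x0300, 0x0f00) == 0x3);

    /* Inserting and then extracting a field value is lossless. */
    assert(MASK_EXTR(MASK_INSR(0xa, 0x0f00), 0x0f00) == 0xa);
    return 0;
}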

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -14,6 +14,9 @@ typedef bool bool_t;
 #define ASSERT assert
 #define ASSERT_UNREACHABLE() assert(!__LINE__)
 
+#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
+#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))
+
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1611,7 +1611,6 @@ struct x86_emulate_state {
         ext_8f09,
         ext_8f0a,
     } ext;
-    uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
     bool lock_prefix;
@@ -1657,7 +1656,7 @@ x86_decode_base(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode )
     {
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
@@ -1696,11 +1695,9 @@ x86_decode_twobyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
     {
     case 0x78:
-        if ( vex.opcx )
-            break;
         switch ( vex.pfx )
         {
         case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -1709,7 +1706,23 @@ x86_decode_twobyte(
             imm2 = insn_fetch_type(uint8_t);
             break;
         }
-        break;
+        /* fall through */
+    case 0x10 ... 0x18:
+    case 0x28 ... 0x2f:
+    case 0x50 ... 0x77:
+    case 0x79 ... 0x7f:
+    case 0xae:
+    case 0xc2:
+    case 0xc4 ... 0xc7:
+    case 0xd0 ... 0xfe:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+        /* Intentionally not handling here despite being modified by F3:
+    case 0xb8: jmpe / popcnt
+    case 0xbc: bsf / tzcnt
+    case 0xbd: bsr / lzcnt
+         * They're being dealt with in the execution phase (if at all).
+         */
     }
 
  done:
@@ -1717,13 +1730,35 @@ x86_decode_twobyte(
 }
 
 static int
+x86_decode_0f38(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case 0x00 ... 0xef:
+    case 0xf2 ... 0xff:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
+    case 0xf0: case 0xf1: /* movbe / crc32 */
+        if ( rep_prefix() )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
     uint8_t b, d, sib, sib_index, sib_base;
-    unsigned int def_op_bytes, def_ad_bytes;
+    unsigned int def_op_bytes, def_ad_bytes, opcode;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
@@ -1804,29 +1839,31 @@ x86_decode(
 
     /* Opcode byte(s). */
     d = opcode_table[b];
-    if ( d == 0 )
+    if ( d == 0 && b == 0x0f)
     {
-        /* Two-byte opcode? */
-        if ( b == 0x0f )
+        /* Two-byte opcode. */
+        b = insn_fetch_type(uint8_t);
+        d = twobyte_table[b];
+        switch ( b )
         {
+        default:
+            opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f;
+            break;
+        case 0x38:
             b = insn_fetch_type(uint8_t);
-            d = twobyte_table[b];
-            switch ( b )
-            {
-            default:
-                ext = ext_0f;
-                break;
-            case 0x38:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f38;
-                break;
-            case 0x3a:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f3a;
-                break;
-            }
+            opcode = b | MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f38;
+            break;
+        case 0x3a:
+            b = insn_fetch_type(uint8_t);
+            opcode = b | MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f3a;
+            break;
         }
     }
+    else
+        opcode = b;
 
     /* ModRM and SIB bytes. */
     if ( d & ModRM )
@@ -1855,6 +1892,7 @@ x86_decode(
                 vex.raw[0] = modrm;
                 if ( b == 0xc5 )
                 {
+                    opcode = X86EMUL_OPC_VEX_;
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
                     vex.x = 1;
@@ -1876,31 +1914,44 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
-                    if ( b == 0x62 )
+                    switch ( b )
                     {
+                    case 0x62:
+                        opcode = X86EMUL_OPC_EVEX_;
                         evex.raw[0] = vex.raw[0];
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
                         vex.opcx = evex.opcx;
+                        break;
+                    case 0xc4:
+                        opcode = X86EMUL_OPC_VEX_;
+                        break;
+                    default:
+                        opcode = 0;
+                        break;
                     }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
+                opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
                 ext = vex.opcx;
                 if ( b != 0x8f )
                 {
                     switch ( ext )
                     {
                     case vex_0f:
+                        opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[b];
                         break;
                     case vex_0f38:
+                        opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x38];
                         break;
                     case vex_0f3a:
+                        opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x3a];
                         break;
                     default:
@@ -1910,7 +1961,11 @@ x86_decode(
                 }
                 else if ( ext < ext_8f08 +
                                 sizeof(xop_table) / sizeof(*xop_table) )
+                {
+                    opcode |= MASK_INSR(0x8f08 + ext - ext_8f08,
+                                        X86EMUL_OPC_EXT_MASK);
                     d = xop_table[ext - ext_8f08];
+                }
                 else
                 {
                     rc = X86EMUL_UNHANDLEABLE;
@@ -1980,9 +2035,7 @@ x86_decode(
             break;
 
         case ext_0f38:
-            if ( vex.opcx )
-                break;
-            switch ( b )
+            switch ( opcode & X86EMUL_OPC_MASK )
             {
             case 0xf0: /* movbe / crc32 */
                 d |= repne_prefix() ? ByteOp : Mov;
@@ -1991,8 +2044,6 @@ x86_decode(
                 if ( !repne_prefix() )
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
-            default: /* Until it is worth making this table based ... */
-                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -2153,7 +2204,7 @@ x86_decode(
         break;
     }
 
-    state->opcode = b;
+    ctxt->opcode = opcode;
     state->desc = d;
 
     switch ( ext )
@@ -2167,7 +2218,14 @@ x86_decode(
         break;
 
     case ext_0f38:
+        rc = x86_decode_0f38(state, ctxt, ops);
+        break;
+
     case ext_0f3a:
+        if ( !vex.opcx )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     case ext_8f08:
     case ext_8f09:
     case ext_8f0a:
@@ -2209,7 +2267,7 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
-    b = state.opcode;
+    b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
 
@@ -2380,24 +2438,7 @@ x86_emulate(
         break;
     }
 
-    switch ( ext )
-    {
-    case ext_none:
-        break;
-    case ext_0f:
-        goto ext_0f_insn;
-    case ext_0f38:
-        goto ext_0f38_insn;
-    default:
-        ASSERT_UNREACHABLE();
-    case ext_0f3a:
-    case ext_8f08:
-    case ext_8f09:
-    case ext_8f0a:
-        goto cannot_emulate;
-    }
-
-    switch ( b )
+    switch ( ctxt->opcode )
     {
         struct segment_register cs;
 
@@ -4108,15 +4149,7 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f_insn:
-    switch ( b )
-    {
-    case 0x00: /* Grp6 */
+    case X86EMUL_OPC(0x0f, 0x00): /* Grp6 */
         fail_if((modrm_reg & 6) != 2);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4125,7 +4158,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x01: /* Grp7 */ {
+    case X86EMUL_OPC(0x0f, 0x01): /* Grp7 */ {
         struct segment_register reg;
         unsigned long base, limit, cr0, cr0w;
 
@@ -4270,7 +4303,7 @@ x86_emulate(
         break;
     }
 
-    case 0x05: /* syscall */ {
+    case X86EMUL_OPC(0x0f, 0x05): /* syscall */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
 
@@ -4330,7 +4363,7 @@ x86_emulate(
         break;
     }
 
-    case 0x06: /* clts */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
         if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
@@ -4338,42 +4371,64 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x08: /* invd */
-    case 0x09: /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x08): /* invd */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->wbinvd == NULL);
         if ( (rc = ops->wbinvd(ctxt)) != 0 )
             goto done;
         break;
 
-    case 0x0b: /* ud2 */
-    case 0xb9: /* ud1 */
-    case 0xff: /* ud0 */
+    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
+    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
+    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
         generate_exception_if(1, EXC_UD, -1);
 
-    case 0x0d: /* GrpP (prefetch) */
-    case 0x18: /* Grp16 (prefetch/nop) */
-    case 0x19 ... 0x1f: /* nop (amd-defined) */
+    case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */
+    case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
+    case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case 0x2b: /* {,v}movntp{s,d} xmm,m128 */
-               /* vmovntp{s,d} ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
+                                         /* vmovntps ymm,m256 */
+    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
+                                         /* vmovntpd ymm,m256 */
         fail_if(ea.type != OP_MEM);
         /* fall through */
-    case 0x28: /* {,v}movap{s,d} xmm/m128,xmm */
-               /* vmovap{s,d} ymm/m256,ymm */
-    case 0x29: /* {,v}movap{s,d} xmm,xmm/m128 */
-               /* vmovap{s,d} ymm,ymm/m256 */
-        fail_if(vex.pfx & VEX_PREFIX_SCALAR_MASK);
-        /* fall through */
-    case 0x10: /* {,v}movup{s,d} xmm/m128,xmm */
-               /* vmovup{s,d} ymm/m256,ymm */
-               /* {,v}movss xmm/m32,xmm */
-               /* {,v}movsd xmm/m64,xmm */
-    case 0x11: /* {,v}movup{s,d} xmm,xmm/m128 */
-               /* vmovup{s,d} ymm,ymm/m256 */
-               /* {,v}movss xmm,xmm/m32 */
-               /* {,v}movsd xmm,xmm/m64 */
+    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
+                                         /* vmovaps ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
+                                         /* vmovapd ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
+                                         /* vmovaps ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
+                                         /* vmovapd ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
+                                         /* vmovups ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
+                                         /* vmovupd ymm/m256,ymm */
+    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
+    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
+                                         /* vmovups ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
+                                         /* vmovupd ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
+    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4396,10 +4451,9 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) ||
-                    ((vex.reg != 0xf) &&
-                     ((ea.type == OP_MEM) ||
-                      !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+            fail_if((vex.reg != 0xf) &&
+                    ((ea.type == OP_MEM) ||
+                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4437,10 +4491,10 @@ x86_emulate(
         break;
     }
 
-    case 0x20: /* mov cr,reg */
-    case 0x21: /* mov dr,reg */
-    case 0x22: /* mov reg,cr */
-    case 0x23: /* mov reg,dr */
+    case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
+    case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
+    case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
+    case X86EMUL_OPC(0x0f, 0x23): /* mov reg,dr */
         generate_exception_if(ea.type != OP_REG, EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         modrm_reg |= lock_prefix << 3;
@@ -4476,7 +4530,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x30: /* wrmsr */ {
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ {
         uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);
@@ -4485,7 +4539,7 @@ x86_emulate(
         break;
     }
 
-    case 0x31: rdtsc: /* rdtsc */ {
+    case X86EMUL_OPC(0x0f, 0x31): rdtsc: /* rdtsc */ {
         unsigned long cr4;
         uint64_t val;
         if ( !mode_ring0() )
@@ -4503,7 +4557,7 @@ x86_emulate(
         break;
     }
 
-    case 0x32: /* rdmsr */ {
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ {
         uint64_t val;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->read_msr == NULL);
@@ -4514,13 +4568,13 @@ x86_emulate(
         break;
     }
 
-    case 0x40 ... 0x4f: /* cmovcc */
+    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
         dst.val = src.val;
         if ( !test_cc(b, _regs.eflags) )
             dst.type = OP_NONE;
         break;
 
-    case 0x34: /* sysenter */ {
+    case X86EMUL_OPC(0x0f, 0x34): /* sysenter */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         int lm;
@@ -4568,7 +4622,7 @@ x86_emulate(
         break;
     }
 
-    case 0x35: /* sysexit */ {
+    case X86EMUL_OPC(0x0f, 0x35): /* sysexit */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         bool_t user64 = !!(rex_prefix & REX_W);
@@ -4607,18 +4661,26 @@ x86_emulate(
         break;
     }
 
-    case 0xe7: /* movntq mm,m64 */
-               /* {,v}movntdq xmm,m128 */
-               /* vmovntdq ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
+    case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
+                                         /* vmovntdq ymm,m256 */
         fail_if(ea.type != OP_MEM);
-        fail_if(vex.pfx == vex_f3);
         /* fall through */
-    case 0x6f: /* movq mm/m64,mm */
-               /* {,v}movdq{a,u} xmm/m128,xmm */
-               /* vmovdq{a,u} ymm/m256,ymm */
-    case 0x7f: /* movq mm,mm/m64 */
-               /* {,v}movdq{a,u} xmm,xmm/m128 */
-               /* vmovdq{a,u} ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
+    case X86EMUL_OPC_66(0x0f, 0x6f):     /* movdqa xmm/m128,xmm */
+    case X86EMUL_OPC_F3(0x0f, 0x6f):     /* movdqu xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
+                                         /* vmovdqa ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
+                                         /* vmovdqu ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+    case X86EMUL_OPC_66(0x0f, 0x7f):     /* movdqa xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
+                                         /* vmovdqa ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x7f):     /* movdqu xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
+                                         /* vmovdqu ymm,ymm/m256 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4654,8 +4716,7 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
-                    ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
+            fail_if(vex.reg != 0xf);
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4691,24 +4752,24 @@ x86_emulate(
         break;
     }
 
-    case 0x80 ... 0x8f: /* jcc (near) */
+    case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
         break;
 
-    case 0x90 ... 0x9f: /* setcc */
+    case X86EMUL_OPC(0x0f, 0x90) ... X86EMUL_OPC(0x0f, 0x9f): /* setcc */
         dst.val = test_cc(b, _regs.eflags);
         break;
 
-    case 0xa0: /* push %%fs */
+    case X86EMUL_OPC(0x0f, 0xa0): /* push %%fs */
         src.val = x86_seg_fs;
         goto push_seg;
 
-    case 0xa1: /* pop %%fs */
+    case X86EMUL_OPC(0x0f, 0xa1): /* pop %%fs */
         src.val = x86_seg_fs;
         goto pop_seg;
 
-    case 0xa2: /* cpuid */ {
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ {
         unsigned int eax = _regs.eax, ebx = _regs.ebx;
         unsigned int ecx = _regs.ecx, edx = _regs.edx;
         fail_if(ops->cpuid == NULL);
@@ -4719,15 +4780,15 @@ x86_emulate(
         break;
     }
 
-    case 0xa3: bt: /* bt */
+    case X86EMUL_OPC(0x0f, 0xa3): bt: /* bt */
         emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
 
-    case 0xa4: /* shld imm8,r,r/m */
-    case 0xa5: /* shld %%cl,r,r/m */
-    case 0xac: /* shrd imm8,r,r/m */
-    case 0xad: /* shrd %%cl,r,r/m */ {
+    case X86EMUL_OPC(0x0f, 0xa4): /* shld imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xa5): /* shld %%cl,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xac): /* shrd imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xad): /* shrd %%cl,r,r/m */ {
         uint8_t shift, width = dst.bytes << 3;
 
         generate_exception_if(lock_prefix, EXC_UD, -1);
@@ -4762,24 +4823,23 @@ x86_emulate(
         break;
     }
 
-    case 0xa8: /* push %%gs */
+    case X86EMUL_OPC(0x0f, 0xa8): /* push %%gs */
         src.val = x86_seg_gs;
         goto push_seg;
 
-    case 0xa9: /* pop %%gs */
+    case X86EMUL_OPC(0x0f, 0xa9): /* pop %%gs */
         src.val = x86_seg_gs;
         goto pop_seg;
 
-    case 0xab: bts: /* bts */
+    case X86EMUL_OPC(0x0f, 0xab): bts: /* bts */
         emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
         break;
 
-    case 0xae: /* Grp15 */
+    case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
         case 7: /* clflush{,opt} */
             fail_if(modrm_mod == 3);
-            fail_if(rep_prefix());
             fail_if(ops->wbinvd == NULL);
             if ( (rc = ops->wbinvd(ctxt)) != 0 )
                 goto done;
@@ -4789,11 +4849,11 @@ x86_emulate(
         }
         break;
 
-    case 0xaf: /* imul */
+    case X86EMUL_OPC(0x0f, 0xaf): /* imul */
         emulate_2op_SrcV_srcmem("imul", src, dst, _regs.eflags);
         break;
 
-    case 0xb0 ... 0xb1: /* cmpxchg */
+    case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.eax;
@@ -4812,34 +4872,34 @@ x86_emulate(
         }
         break;
 
-    case 0xb2: /* lss */
+    case X86EMUL_OPC(0x0f, 0xb2): /* lss */
         dst.val = x86_seg_ss;
         goto les;
 
-    case 0xb3: btr: /* btr */
+    case X86EMUL_OPC(0x0f, 0xb3): btr: /* btr */
         emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
         break;
 
-    case 0xb4: /* lfs */
+    case X86EMUL_OPC(0x0f, 0xb4): /* lfs */
         dst.val = x86_seg_fs;
         goto les;
 
-    case 0xb5: /* lgs */
+    case X86EMUL_OPC(0x0f, 0xb5): /* lgs */
         dst.val = x86_seg_gs;
         goto les;
 
-    case 0xb6: /* movzx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb6): /* movzx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (uint8_t)src.val;
         break;
 
-    case 0xb7: /* movzx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb7): /* movzx rm16,r{16,32,64} */
         dst.val = (uint16_t)src.val;
         break;
 
-    case 0xba: /* Grp8 */
+    case X86EMUL_OPC(0x0f, 0xba): /* Grp8 */
         switch ( modrm_reg & 7 )
         {
         case 4: goto bt;
@@ -4850,11 +4910,11 @@ x86_emulate(
         }
         break;
 
-    case 0xbb: btc: /* btc */
+    case X86EMUL_OPC(0x0f, 0xbb): btc: /* btc */
         emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
         break;
 
-    case 0xbc: /* bsf or tzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbc): /* bsf or tzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4886,7 +4946,7 @@ x86_emulate(
         break;
     }
 
-    case 0xbd: /* bsr or lzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbd): /* bsr or lzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4922,18 +4982,18 @@ x86_emulate(
         break;
     }
 
-    case 0xbe: /* movsx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbe): /* movsx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (int8_t)src.val;
         break;
 
-    case 0xbf: /* movsx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbf): /* movsx rm16,r{16,32,64} */
         dst.val = (int16_t)src.val;
         break;
 
-    case 0xc0 ... 0xc1: /* xadd */
+    case X86EMUL_OPC(0x0f, 0xc0): case X86EMUL_OPC(0x0f, 0xc1): /* xadd */
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -4944,14 +5004,14 @@ x86_emulate(
         }
         goto add;
 
-    case 0xc3: /* movnti */
+    case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have_sse2();
         generate_exception_if(dst.bytes <= 2, EXC_UD, -1);
         dst.val = src.val;
         break;
 
-    case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
+    case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
         unsigned long old[2], exp[2], new[2];
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD, -1);
@@ -4995,7 +5055,7 @@ x86_emulate(
         break;
     }
 
-    case 0xc8 ... 0xcf: /* bswap */
+    case X86EMUL_OPC(0x0f, 0xc8) ... X86EMUL_OPC(0x0f, 0xcf): /* bswap */
         dst.type = OP_REG;
         dst.reg  = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
@@ -5016,72 +5076,57 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f38_insn:
-    switch ( b )
-    {
-    case 0xf0: case 0xf1: /* movbe / crc32 */
-        generate_exception_if(repe_prefix(), EXC_UD, -1);
-        if ( repne_prefix() )
+    case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
+    case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
+        vcpu_must_have_movbe();
+        switch ( op_bytes )
         {
-            /* crc32 */
-#ifdef HAVE_GAS_SSE4_2
-            host_and_vcpu_must_have(sse4_2);
-            dst.bytes = rex_prefix & REX_W ? 8 : 4;
-            switch ( op_bytes )
-            {
-            case 1:
-                asm ( "crc32b %1,%k0" : "+r" (dst.val)
-                                      : "qm" (*(uint8_t *)&src.val) );
-                break;
-            case 2:
-                asm ( "crc32w %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint16_t *)&src.val) );
-                break;
-            case 4:
-                asm ( "crc32l %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint32_t *)&src.val) );
-                break;
-# ifdef __x86_64__
-            case 8:
-                asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
-                break;
-# endif
-            default:
-                ASSERT_UNREACHABLE();
-            }
-#else /* !HAVE_GAS_SSE4_2 */
-            goto cannot_emulate;
+        case 2:
+            asm ( "xchg %h0,%b0" : "=Q" (dst.val)
+                                 : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 4:
+#ifdef __x86_64__
+            asm ( "bswap %k0" : "=r" (dst.val)
+                              : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 8:
 #endif
+            asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
+            break;
+        default:
+            ASSERT_UNREACHABLE();
         }
-        else
+        break;
+#ifdef HAVE_GAS_SSE4_2
+    case X86EMUL_OPC_F2(0x0f38, 0xf0): /* crc32 r/m8, r{32,64} */
+    case X86EMUL_OPC_F2(0x0f38, 0xf1): /* crc32 r/m{16,32,64}, r{32,64} */
+        host_and_vcpu_must_have(sse4_2);
+        dst.bytes = rex_prefix & REX_W ? 8 : 4;
+        switch ( op_bytes )
         {
-            /* movbe */
-            vcpu_must_have_movbe();
-            switch ( op_bytes )
-            {
-            case 2:
-                asm ( "xchg %h0,%b0" : "=Q" (dst.val)
-                                     : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 4:
-#ifdef __x86_64__
-                asm ( "bswap %k0" : "=r" (dst.val)
-                                  : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 8:
-#endif
-                asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
-                break;
-            default:
-                ASSERT_UNREACHABLE();
-            }
+        case 1:
+            asm ( "crc32b %1,%k0" : "+r" (dst.val)
+                                  : "qm" (*(uint8_t *)&src.val) );
+            break;
+        case 2:
+            asm ( "crc32w %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint16_t *)&src.val) );
+            break;
+        case 4:
+            asm ( "crc32l %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint32_t *)&src.val) );
+            break;
+# ifdef __x86_64__
+        case 8:
+            asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
+            break;
+# endif
+        default:
+            ASSERT_UNREACHABLE();
         }
         break;
+#endif
     default:
         goto cannot_emulate;
     }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -415,12 +415,15 @@ struct x86_emulate_ctxt
     /* Stack pointer width in bits (16, 32 or 64). */
     unsigned int sp_size;
 
-    /* Set this if writes may have side effects. */
-    uint8_t force_writeback;
+    /* Canonical opcode (see below). */
+    unsigned int opcode;
 
     /* Software event injection support. */
     enum x86_swint_emulation swint_emulate;
 
+    /* Set this if writes may have side effects. */
+    uint8_t force_writeback;
+
     /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */
     union {
         struct {
@@ -435,6 +438,51 @@ struct x86_emulate_ctxt
     void *data;
 };
 
+/*
+ * This encodes the opcode extension in a "natural" way:
+ *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
+ *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
+ *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
+ * Hence no separate #define-s get added.
+ */
+#define X86EMUL_OPC_EXT_MASK         0xffff0000
+#define X86EMUL_OPC(ext, byte)       ((byte) | \
+                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))
+/*
+ * This includes the 0x66, 0xF3, and 0xF2 prefixes when used to alter
+ * functionality instead of just insn attributes, as well as VEX/EVEX:
+ */
+#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
+                                     X86EMUL_OPC_KIND_MASK)
+
+#define X86EMUL_OPC_PFX_MASK         0x00000300
+# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
+# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
+# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
+
+#define X86EMUL_OPC_KIND_MASK        0x00003000
+#define X86EMUL_OPC_VEX_             0x00001000
+# define X86EMUL_OPC_VEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
+#define X86EMUL_OPC_EVEX_            0x00002000
+# define X86EMUL_OPC_EVEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
+
 struct x86_emulate_stub {
     union {
         void (*func)(void);
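
For illustration (this note is not part of the patch), a worked expansion
of the encoding described in the comment above, using the macro
definitions from this hunk:

    /*
     * X86EMUL_OPC(0x0f, 0x6f)        == 0x000f006f  (movq mm/m64,mm)
     * X86EMUL_OPC_66(0x0f, 0x6f)     == 0x000f016f  (movdqa xmm/m128,xmm)
     * X86EMUL_OPC_VEX_66(0x0f, 0x6f) == 0x000f116f  (vmovdqa xmm/m128,xmm)
     * X86EMUL_OPC_F2(0x0f38, 0xf0)   == 0x0f3803f0  (crc32 r/m8,r{32,64})
     *
     * I.e. the opcode extension map occupies bits 16-31, the VEX/EVEX
     * kind bits 12-13, the legacy prefix bits 8-9, and the main opcode
     * byte bits 0-7.
     */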



[-- Attachment #2: x86emul-opcode-canon.patch --]
[-- Type: text/plain, Size: 33606 bytes --]

x86emul: generate and make use of canonical opcode representation

This representation is then made available to interested callers, to
facilitate replacing their custom decoding.

This entails combining the three main switch statements into one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
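
For illustration (this sketch is not part of the patch), a minimal example
of how a hypothetical caller could replace its custom decoding by keying
off the canonical opcode that the decode phase now stores in ctxt->opcode.
The X86EMUL_OPC*() values below are taken from this patch; the function
name, include path, and handler bodies are made up:

    #include "x86_emulate.h"

    /* Illustrative sketch only, not part of the patch. */
    static void example_dispatch(const struct x86_emulate_ctxt *ctxt)
    {
        switch ( ctxt->opcode )
        {
        case X86EMUL_OPC(0x0f, 0x22):        /* mov reg,cr */
            /* e.g. hand off to a privileged-operation handler */
            break;

        case X86EMUL_OPC_66(0x0f, 0x6f):     /* movdqa xmm/m128,xmm */
        case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
            /*
             * Legacy and VEX forms remain distinct case labels, differing
             * only in the prefix/kind bits of the canonical value.
             */
            break;

        default:
            break;
        }
    }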

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -14,6 +14,9 @@ typedef bool bool_t;
 #define ASSERT assert
 #define ASSERT_UNREACHABLE() assert(!__LINE__)
 
+#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
+#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))
+
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1611,7 +1611,6 @@ struct x86_emulate_state {
         ext_8f09,
         ext_8f0a,
     } ext;
-    uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
     bool lock_prefix;
@@ -1657,7 +1656,7 @@ x86_decode_base(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode )
     {
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
@@ -1696,11 +1695,9 @@ x86_decode_twobyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
     {
     case 0x78:
-        if ( vex.opcx )
-            break;
         switch ( vex.pfx )
         {
         case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -1709,7 +1706,23 @@ x86_decode_twobyte(
             imm2 = insn_fetch_type(uint8_t);
             break;
         }
-        break;
+        /* fall through */
+    case 0x10 ... 0x18:
+    case 0x28 ... 0x2f:
+    case 0x50 ... 0x77:
+    case 0x79 ... 0x7f:
+    case 0xae:
+    case 0xc2:
+    case 0xc4 ... 0xc7:
+    case 0xd0 ... 0xfe:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+        /* Intentionally not handling here despite being modified by F3:
+    case 0xb8: jmpe / popcnt
+    case 0xbc: bsf / tzcnt
+    case 0xbd: bsr / lzcnt
+         * They're being dealt with in the execution phase (if at all).
+         */
     }
 
  done:
@@ -1717,13 +1730,35 @@ x86_decode_twobyte(
 }
 
 static int
+x86_decode_0f38(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case 0x00 ... 0xef:
+    case 0xf2 ... 0xff:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
+    case 0xf0: case 0xf1: /* movbe / crc32 */
+        if ( rep_prefix() )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
     uint8_t b, d, sib, sib_index, sib_base;
-    unsigned int def_op_bytes, def_ad_bytes;
+    unsigned int def_op_bytes, def_ad_bytes, opcode;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
@@ -1804,29 +1839,31 @@ x86_decode(
 
     /* Opcode byte(s). */
     d = opcode_table[b];
-    if ( d == 0 )
+    if ( d == 0 && b == 0x0f)
     {
-        /* Two-byte opcode? */
-        if ( b == 0x0f )
+        /* Two-byte opcode. */
+        b = insn_fetch_type(uint8_t);
+        d = twobyte_table[b];
+        switch ( b )
         {
+        default:
+            opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f;
+            break;
+        case 0x38:
             b = insn_fetch_type(uint8_t);
-            d = twobyte_table[b];
-            switch ( b )
-            {
-            default:
-                ext = ext_0f;
-                break;
-            case 0x38:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f38;
-                break;
-            case 0x3a:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f3a;
-                break;
-            }
+            opcode = b | MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f38;
+            break;
+        case 0x3a:
+            b = insn_fetch_type(uint8_t);
+            opcode = b | MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f3a;
+            break;
         }
     }
+    else
+        opcode = b;
 
     /* ModRM and SIB bytes. */
     if ( d & ModRM )
@@ -1855,6 +1892,7 @@ x86_decode(
                 vex.raw[0] = modrm;
                 if ( b == 0xc5 )
                 {
+                    opcode = X86EMUL_OPC_VEX_;
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
                     vex.x = 1;
@@ -1876,31 +1914,44 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
-                    if ( b == 0x62 )
+                    switch ( b )
                     {
+                    case 0x62:
+                        opcode = X86EMUL_OPC_EVEX_;
                         evex.raw[0] = vex.raw[0];
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
                         vex.opcx = evex.opcx;
+                        break;
+                    case 0xc4:
+                        opcode = X86EMUL_OPC_VEX_;
+                        break;
+                    default:
+                        opcode = 0;
+                        break;
                     }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
+                opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
                 ext = vex.opcx;
                 if ( b != 0x8f )
                 {
                     switch ( ext )
                     {
                     case vex_0f:
+                        opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[b];
                         break;
                     case vex_0f38:
+                        opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x38];
                         break;
                     case vex_0f3a:
+                        opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x3a];
                         break;
                     default:
@@ -1910,7 +1961,11 @@ x86_decode(
                 }
                 else if ( ext < ext_8f08 +
                                 sizeof(xop_table) / sizeof(*xop_table) )
+                {
+                    opcode |= MASK_INSR(0x8f08 + ext - ext_8f08,
+                                        X86EMUL_OPC_EXT_MASK);
                     d = xop_table[ext - ext_8f08];
+                }
                 else
                 {
                     rc = X86EMUL_UNHANDLEABLE;
@@ -1980,9 +2035,7 @@ x86_decode(
             break;
 
         case ext_0f38:
-            if ( vex.opcx )
-                break;
-            switch ( b )
+            switch ( opcode & X86EMUL_OPC_MASK )
             {
             case 0xf0: /* movbe / crc32 */
                 d |= repne_prefix() ? ByteOp : Mov;
@@ -1991,8 +2044,6 @@ x86_decode(
                 if ( !repne_prefix() )
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
-            default: /* Until it is worth making this table based ... */
-                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -2153,7 +2204,7 @@ x86_decode(
         break;
     }
 
-    state->opcode = b;
+    ctxt->opcode = opcode;
     state->desc = d;
 
     switch ( ext )
@@ -2167,7 +2218,14 @@ x86_decode(
         break;
 
     case ext_0f38:
+        rc = x86_decode_0f38(state, ctxt, ops);
+        break;
+
     case ext_0f3a:
+        if ( !vex.opcx )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     case ext_8f08:
     case ext_8f09:
     case ext_8f0a:
@@ -2209,7 +2267,7 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
-    b = state.opcode;
+    b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
 
@@ -2380,24 +2438,7 @@ x86_emulate(
         break;
     }
 
-    switch ( ext )
-    {
-    case ext_none:
-        break;
-    case ext_0f:
-        goto ext_0f_insn;
-    case ext_0f38:
-        goto ext_0f38_insn;
-    default:
-        ASSERT_UNREACHABLE();
-    case ext_0f3a:
-    case ext_8f08:
-    case ext_8f09:
-    case ext_8f0a:
-        goto cannot_emulate;
-    }
-
-    switch ( b )
+    switch ( ctxt->opcode )
     {
         struct segment_register cs;
 
@@ -4108,15 +4149,7 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f_insn:
-    switch ( b )
-    {
-    case 0x00: /* Grp6 */
+    case X86EMUL_OPC(0x0f, 0x00): /* Grp6 */
         fail_if((modrm_reg & 6) != 2);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4125,7 +4158,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x01: /* Grp7 */ {
+    case X86EMUL_OPC(0x0f, 0x01): /* Grp7 */ {
         struct segment_register reg;
         unsigned long base, limit, cr0, cr0w;
 
@@ -4270,7 +4303,7 @@ x86_emulate(
         break;
     }
 
-    case 0x05: /* syscall */ {
+    case X86EMUL_OPC(0x0f, 0x05): /* syscall */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
 
@@ -4330,7 +4363,7 @@ x86_emulate(
         break;
     }
 
-    case 0x06: /* clts */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
         if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
@@ -4338,42 +4371,64 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x08: /* invd */
-    case 0x09: /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x08): /* invd */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->wbinvd == NULL);
         if ( (rc = ops->wbinvd(ctxt)) != 0 )
             goto done;
         break;
 
-    case 0x0b: /* ud2 */
-    case 0xb9: /* ud1 */
-    case 0xff: /* ud0 */
+    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
+    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
+    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
         generate_exception_if(1, EXC_UD, -1);
 
-    case 0x0d: /* GrpP (prefetch) */
-    case 0x18: /* Grp16 (prefetch/nop) */
-    case 0x19 ... 0x1f: /* nop (amd-defined) */
+    case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */
+    case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
+    case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case 0x2b: /* {,v}movntp{s,d} xmm,m128 */
-               /* vmovntp{s,d} ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
+                                         /* vmovntps ymm,m256 */
+    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
+                                         /* vmovntpd ymm,m256 */
         fail_if(ea.type != OP_MEM);
         /* fall through */
-    case 0x28: /* {,v}movap{s,d} xmm/m128,xmm */
-               /* vmovap{s,d} ymm/m256,ymm */
-    case 0x29: /* {,v}movap{s,d} xmm,xmm/m128 */
-               /* vmovap{s,d} ymm,ymm/m256 */
-        fail_if(vex.pfx & VEX_PREFIX_SCALAR_MASK);
-        /* fall through */
-    case 0x10: /* {,v}movup{s,d} xmm/m128,xmm */
-               /* vmovup{s,d} ymm/m256,ymm */
-               /* {,v}movss xmm/m32,xmm */
-               /* {,v}movsd xmm/m64,xmm */
-    case 0x11: /* {,v}movup{s,d} xmm,xmm/m128 */
-               /* vmovup{s,d} ymm,ymm/m256 */
-               /* {,v}movss xmm,xmm/m32 */
-               /* {,v}movsd xmm,xmm/m64 */
+    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
+                                         /* vmovaps ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
+                                         /* vmovapd ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
+                                         /* vmovaps ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
+                                         /* vmovapd ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
+                                         /* vmovups ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
+                                         /* vmovupd ymm/m256,ymm */
+    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
+    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
+                                         /* vmovups ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
+                                         /* vmovupd ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
+    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4396,10 +4451,9 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) ||
-                    ((vex.reg != 0xf) &&
-                     ((ea.type == OP_MEM) ||
-                      !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+            fail_if((vex.reg != 0xf) &&
+                    ((ea.type == OP_MEM) ||
+                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4437,10 +4491,10 @@ x86_emulate(
         break;
     }
 
-    case 0x20: /* mov cr,reg */
-    case 0x21: /* mov dr,reg */
-    case 0x22: /* mov reg,cr */
-    case 0x23: /* mov reg,dr */
+    case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
+    case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
+    case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
+    case X86EMUL_OPC(0x0f, 0x23): /* mov reg,dr */
         generate_exception_if(ea.type != OP_REG, EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         modrm_reg |= lock_prefix << 3;
@@ -4476,7 +4530,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x30: /* wrmsr */ {
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ {
         uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);
@@ -4485,7 +4539,7 @@ x86_emulate(
         break;
     }
 
-    case 0x31: rdtsc: /* rdtsc */ {
+    case X86EMUL_OPC(0x0f, 0x31): rdtsc: /* rdtsc */ {
         unsigned long cr4;
         uint64_t val;
         if ( !mode_ring0() )
@@ -4503,7 +4557,7 @@ x86_emulate(
         break;
     }
 
-    case 0x32: /* rdmsr */ {
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ {
         uint64_t val;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->read_msr == NULL);
@@ -4514,13 +4568,13 @@ x86_emulate(
         break;
     }
 
-    case 0x40 ... 0x4f: /* cmovcc */
+    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
         dst.val = src.val;
         if ( !test_cc(b, _regs.eflags) )
             dst.type = OP_NONE;
         break;
 
-    case 0x34: /* sysenter */ {
+    case X86EMUL_OPC(0x0f, 0x34): /* sysenter */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         int lm;
@@ -4568,7 +4622,7 @@ x86_emulate(
         break;
     }
 
-    case 0x35: /* sysexit */ {
+    case X86EMUL_OPC(0x0f, 0x35): /* sysexit */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         bool_t user64 = !!(rex_prefix & REX_W);
@@ -4607,18 +4661,26 @@ x86_emulate(
         break;
     }
 
-    case 0xe7: /* movntq mm,m64 */
-               /* {,v}movntdq xmm,m128 */
-               /* vmovntdq ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
+    case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
+                                         /* vmovntdq ymm,m256 */
         fail_if(ea.type != OP_MEM);
-        fail_if(vex.pfx == vex_f3);
         /* fall through */
-    case 0x6f: /* movq mm/m64,mm */
-               /* {,v}movdq{a,u} xmm/m128,xmm */
-               /* vmovdq{a,u} ymm/m256,ymm */
-    case 0x7f: /* movq mm,mm/m64 */
-               /* {,v}movdq{a,u} xmm,xmm/m128 */
-               /* vmovdq{a,u} ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
+    case X86EMUL_OPC_66(0x0f, 0x6f):     /* movdqa xmm/m128,xmm */
+    case X86EMUL_OPC_F3(0x0f, 0x6f):     /* movdqu xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
+                                         /* vmovdqa ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
+                                         /* vmovdqu ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+    case X86EMUL_OPC_66(0x0f, 0x7f):     /* movdqa xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
+                                         /* vmovdqa ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x7f):     /* movdqu xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
+                                         /* vmovdqu ymm,ymm/m256 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4654,8 +4716,7 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
-                    ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
+            fail_if(vex.reg != 0xf);
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4691,24 +4752,24 @@ x86_emulate(
         break;
     }
 
-    case 0x80 ... 0x8f: /* jcc (near) */
+    case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
         break;
 
-    case 0x90 ... 0x9f: /* setcc */
+    case X86EMUL_OPC(0x0f, 0x90) ... X86EMUL_OPC(0x0f, 0x9f): /* setcc */
         dst.val = test_cc(b, _regs.eflags);
         break;
 
-    case 0xa0: /* push %%fs */
+    case X86EMUL_OPC(0x0f, 0xa0): /* push %%fs */
         src.val = x86_seg_fs;
         goto push_seg;
 
-    case 0xa1: /* pop %%fs */
+    case X86EMUL_OPC(0x0f, 0xa1): /* pop %%fs */
         src.val = x86_seg_fs;
         goto pop_seg;
 
-    case 0xa2: /* cpuid */ {
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ {
         unsigned int eax = _regs.eax, ebx = _regs.ebx;
         unsigned int ecx = _regs.ecx, edx = _regs.edx;
         fail_if(ops->cpuid == NULL);
@@ -4719,15 +4780,15 @@ x86_emulate(
         break;
     }
 
-    case 0xa3: bt: /* bt */
+    case X86EMUL_OPC(0x0f, 0xa3): bt: /* bt */
         emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
 
-    case 0xa4: /* shld imm8,r,r/m */
-    case 0xa5: /* shld %%cl,r,r/m */
-    case 0xac: /* shrd imm8,r,r/m */
-    case 0xad: /* shrd %%cl,r,r/m */ {
+    case X86EMUL_OPC(0x0f, 0xa4): /* shld imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xa5): /* shld %%cl,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xac): /* shrd imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xad): /* shrd %%cl,r,r/m */ {
         uint8_t shift, width = dst.bytes << 3;
 
         generate_exception_if(lock_prefix, EXC_UD, -1);
@@ -4762,24 +4823,23 @@ x86_emulate(
         break;
     }
 
-    case 0xa8: /* push %%gs */
+    case X86EMUL_OPC(0x0f, 0xa8): /* push %%gs */
         src.val = x86_seg_gs;
         goto push_seg;
 
-    case 0xa9: /* pop %%gs */
+    case X86EMUL_OPC(0x0f, 0xa9): /* pop %%gs */
         src.val = x86_seg_gs;
         goto pop_seg;
 
-    case 0xab: bts: /* bts */
+    case X86EMUL_OPC(0x0f, 0xab): bts: /* bts */
         emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
         break;
 
-    case 0xae: /* Grp15 */
+    case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
         case 7: /* clflush{,opt} */
             fail_if(modrm_mod == 3);
-            fail_if(rep_prefix());
             fail_if(ops->wbinvd == NULL);
             if ( (rc = ops->wbinvd(ctxt)) != 0 )
                 goto done;
@@ -4789,11 +4849,11 @@ x86_emulate(
         }
         break;
 
-    case 0xaf: /* imul */
+    case X86EMUL_OPC(0x0f, 0xaf): /* imul */
         emulate_2op_SrcV_srcmem("imul", src, dst, _regs.eflags);
         break;
 
-    case 0xb0 ... 0xb1: /* cmpxchg */
+    case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.eax;
@@ -4812,34 +4872,34 @@ x86_emulate(
         }
         break;
 
-    case 0xb2: /* lss */
+    case X86EMUL_OPC(0x0f, 0xb2): /* lss */
         dst.val = x86_seg_ss;
         goto les;
 
-    case 0xb3: btr: /* btr */
+    case X86EMUL_OPC(0x0f, 0xb3): btr: /* btr */
         emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
         break;
 
-    case 0xb4: /* lfs */
+    case X86EMUL_OPC(0x0f, 0xb4): /* lfs */
         dst.val = x86_seg_fs;
         goto les;
 
-    case 0xb5: /* lgs */
+    case X86EMUL_OPC(0x0f, 0xb5): /* lgs */
         dst.val = x86_seg_gs;
         goto les;
 
-    case 0xb6: /* movzx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb6): /* movzx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (uint8_t)src.val;
         break;
 
-    case 0xb7: /* movzx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb7): /* movzx rm16,r{16,32,64} */
         dst.val = (uint16_t)src.val;
         break;
 
-    case 0xba: /* Grp8 */
+    case X86EMUL_OPC(0x0f, 0xba): /* Grp8 */
         switch ( modrm_reg & 7 )
         {
         case 4: goto bt;
@@ -4850,11 +4910,11 @@ x86_emulate(
         }
         break;
 
-    case 0xbb: btc: /* btc */
+    case X86EMUL_OPC(0x0f, 0xbb): btc: /* btc */
         emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
         break;
 
-    case 0xbc: /* bsf or tzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbc): /* bsf or tzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4886,7 +4946,7 @@ x86_emulate(
         break;
     }
 
-    case 0xbd: /* bsr or lzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbd): /* bsr or lzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4922,18 +4982,18 @@ x86_emulate(
         break;
     }
 
-    case 0xbe: /* movsx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbe): /* movsx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (int8_t)src.val;
         break;
 
-    case 0xbf: /* movsx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbf): /* movsx rm16,r{16,32,64} */
         dst.val = (int16_t)src.val;
         break;
 
-    case 0xc0 ... 0xc1: /* xadd */
+    case X86EMUL_OPC(0x0f, 0xc0): case X86EMUL_OPC(0x0f, 0xc1): /* xadd */
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -4944,14 +5004,14 @@ x86_emulate(
         }
         goto add;
 
-    case 0xc3: /* movnti */
+    case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have_sse2();
         generate_exception_if(dst.bytes <= 2, EXC_UD, -1);
         dst.val = src.val;
         break;
 
-    case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
+    case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
         unsigned long old[2], exp[2], new[2];
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD, -1);
@@ -4995,7 +5055,7 @@ x86_emulate(
         break;
     }
 
-    case 0xc8 ... 0xcf: /* bswap */
+    case X86EMUL_OPC(0x0f, 0xc8) ... X86EMUL_OPC(0x0f, 0xcf): /* bswap */
         dst.type = OP_REG;
         dst.reg  = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
@@ -5016,72 +5076,57 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f38_insn:
-    switch ( b )
-    {
-    case 0xf0: case 0xf1: /* movbe / crc32 */
-        generate_exception_if(repe_prefix(), EXC_UD, -1);
-        if ( repne_prefix() )
+    case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
+    case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
+        vcpu_must_have_movbe();
+        switch ( op_bytes )
         {
-            /* crc32 */
-#ifdef HAVE_GAS_SSE4_2
-            host_and_vcpu_must_have(sse4_2);
-            dst.bytes = rex_prefix & REX_W ? 8 : 4;
-            switch ( op_bytes )
-            {
-            case 1:
-                asm ( "crc32b %1,%k0" : "+r" (dst.val)
-                                      : "qm" (*(uint8_t *)&src.val) );
-                break;
-            case 2:
-                asm ( "crc32w %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint16_t *)&src.val) );
-                break;
-            case 4:
-                asm ( "crc32l %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint32_t *)&src.val) );
-                break;
-# ifdef __x86_64__
-            case 8:
-                asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
-                break;
-# endif
-            default:
-                ASSERT_UNREACHABLE();
-            }
-#else /* !HAVE_GAS_SSE4_2 */
-            goto cannot_emulate;
+        case 2:
+            asm ( "xchg %h0,%b0" : "=Q" (dst.val)
+                                 : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 4:
+#ifdef __x86_64__
+            asm ( "bswap %k0" : "=r" (dst.val)
+                              : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 8:
 #endif
+            asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
+            break;
+        default:
+            ASSERT_UNREACHABLE();
         }
-        else
+        break;
+#ifdef HAVE_GAS_SSE4_2
+    case X86EMUL_OPC_F2(0x0f38, 0xf0): /* crc32 r/m8, r{32,64} */
+    case X86EMUL_OPC_F2(0x0f38, 0xf1): /* crc32 r/m{16,32,64}, r{32,64} */
+        host_and_vcpu_must_have(sse4_2);
+        dst.bytes = rex_prefix & REX_W ? 8 : 4;
+        switch ( op_bytes )
         {
-            /* movbe */
-            vcpu_must_have_movbe();
-            switch ( op_bytes )
-            {
-            case 2:
-                asm ( "xchg %h0,%b0" : "=Q" (dst.val)
-                                     : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 4:
-#ifdef __x86_64__
-                asm ( "bswap %k0" : "=r" (dst.val)
-                                  : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 8:
-#endif
-                asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
-                break;
-            default:
-                ASSERT_UNREACHABLE();
-            }
+        case 1:
+            asm ( "crc32b %1,%k0" : "+r" (dst.val)
+                                  : "qm" (*(uint8_t *)&src.val) );
+            break;
+        case 2:
+            asm ( "crc32w %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint16_t *)&src.val) );
+            break;
+        case 4:
+            asm ( "crc32l %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint32_t *)&src.val) );
+            break;
+# ifdef __x86_64__
+        case 8:
+            asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
+            break;
+# endif
+        default:
+            ASSERT_UNREACHABLE();
         }
         break;
+#endif
     default:
         goto cannot_emulate;
     }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -415,12 +415,15 @@ struct x86_emulate_ctxt
     /* Stack pointer width in bits (16, 32 or 64). */
     unsigned int sp_size;
 
-    /* Set this if writes may have side effects. */
-    uint8_t force_writeback;
+    /* Canonical opcode (see below). */
+    unsigned int opcode;
 
     /* Software event injection support. */
     enum x86_swint_emulation swint_emulate;
 
+    /* Set this if writes may have side effects. */
+    uint8_t force_writeback;
+
     /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */
     union {
         struct {
@@ -435,6 +438,51 @@ struct x86_emulate_ctxt
     void *data;
 };
 
+/*
+ * This encodes the opcode extension in a "natural" way:
+ *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
+ *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
+ *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
+ * Hence no separate #define-s get added.
+ */
+#define X86EMUL_OPC_EXT_MASK         0xffff0000
+#define X86EMUL_OPC(ext, byte)       ((byte) | \
+                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))
+/*
+ * This includes the 0x66, 0xF3, and 0xF2 prefixes when used to alter
+ * functionality instead of just insn attributes, as well as VEX/EVEX:
+ */
+#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
+                                     X86EMUL_OPC_KIND_MASK)
+
+#define X86EMUL_OPC_PFX_MASK         0x00000300
+# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
+# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
+# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
+
+#define X86EMUL_OPC_KIND_MASK        0x00003000
+#define X86EMUL_OPC_VEX_             0x00001000
+# define X86EMUL_OPC_VEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
+#define X86EMUL_OPC_EVEX_            0x00002000
+# define X86EMUL_OPC_EVEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
+
 struct x86_emulate_stub {
     union {
         void (*func)(void);

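For illustration only (not part of the patch): with X86EMUL_OPC_EXT_MASK
being 0xffff0000, MASK_INSR() places the extension in bits 16-31, so a few
representative canonical values work out as in this standalone sketch
(simplified local macro names, not the actual Xen definitions):

    /* Sketch of the canonical opcode encoding introduced above. */
    #include <assert.h>

    #define OPC(ext, byte)    (((ext) << 16) | (byte))  /* ext in bits 16-31 */
    #define OPC_F3(ext, byte) (OPC(ext, byte) | 0x200)  /* F3-prefixed       */
    #define OPC_F2(ext, byte) (OPC(ext, byte) | 0x300)  /* F2-prefixed       */

    int main(void)
    {
        assert(OPC(0x0f, 0xa2)      == 0x000f00a2); /* cpuid (0f a2)       */
        assert(OPC_F3(0, 0x90)      == 0x00000290); /* pause (f3 90)       */
        assert(OPC_F2(0x0f38, 0xf1) == 0x0f3803f1); /* crc32 (f2 0f 38 f1) */
        return 0;
    }
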
[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (8 preceding siblings ...)
  2016-09-08 13:14 ` [PATCH 08/17] x86emul: generate and make use of canonical opcode representation Jan Beulich
@ 2016-09-08 13:14 ` Jan Beulich
  2016-09-14 17:56   ` Andrew Cooper
  2016-09-08 13:16 ` [PATCH 10/17] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
                   ` (7 subsequent siblings)
  17 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Boris Ostrovsky, Suravee Suthikulpanit

[-- Attachment #1: Type: text/plain, Size: 17369 bytes --]

... instead of custom handling. To facilitate this, break out init code
from _hvm_emulate_one() into the new hvm_emulate_init(), and make
hvmemul_insn_fetch() globally available.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
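
For orientation only (this merely condenses the SVM hunk below and adds
nothing new), the decode-only call sequence enabled by these changes looks
roughly like:

    struct hvm_emulate_ctxt ctxt;
    struct x86_emulate_state *state;
    unsigned int len, rm, reg;

    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
    hvm_emulate_init(&ctxt, NULL, 0);               /* fetch from guest CS:rIP */
    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
    if ( !IS_ERR_OR_NULL(state) )
    {
        len = x86_insn_length(state, &ctxt.ctxt);
        if ( x86_insn_modrm(state, &rm, &reg) >= 0 ) /* mod, or -EINVAL */
        {
            /* ... compare opcode / ModRM against the expected insn ... */
        }
        x86_emulate_free_state(state);
    }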

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -835,7 +835,7 @@ static int hvmemul_read(
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
-static int hvmemul_insn_fetch(
+int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
     void *p_data,
@@ -1765,15 +1765,14 @@ static const struct x86_emulate_ops hvm_
     .vmfunc        = hvmemul_vmfunc,
 };
 
-static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
-    const struct x86_emulate_ops *ops)
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes)
 {
-    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
-    uint32_t new_intr_shadow, pfec = PFEC_page_present;
-    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    unsigned int pfec = PFEC_page_present;
     unsigned long addr;
-    int rc;
 
     if ( hvm_long_mode_enabled(curr) &&
          hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
@@ -1791,14 +1790,14 @@ static int _hvm_emulate_one(struct hvm_e
     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    hvmemul_ctxt->insn_buf_eip = regs->eip;
-    if ( !vio->mmio_insn_bytes )
+    hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->eip;
+    if ( !insn_bytes )
     {
         hvmemul_ctxt->insn_buf_bytes =
             hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
             (hvm_virtual_to_linear_addr(x86_seg_cs,
                                         &hvmemul_ctxt->seg_reg[x86_seg_cs],
-                                        regs->eip,
+                                        hvmemul_ctxt->insn_buf_eip,
                                         sizeof(hvmemul_ctxt->insn_buf),
                                         hvm_access_insn_fetch,
                                         hvmemul_ctxt->ctxt.addr_size,
@@ -1810,11 +1809,24 @@ static int _hvm_emulate_one(struct hvm_e
     }
     else
     {
-        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
-        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
     }
 
     hvmemul_ctxt->exn_pending = 0;
+}
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t new_intr_shadow;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    int rc;
+
+    hvm_emulate_init(hvmemul_ctxt, vio->mmio_insn, vio->mmio_insn_bytes);
+
     vio->mmio_retry = 0;
 
     if ( cpu_has_vmx )
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -15,7 +15,7 @@
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <xen/config.h>
+#include <xen/err.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -26,41 +26,6 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 
-static unsigned int is_prefix(u8 opc)
-{
-    switch ( opc )
-    {
-    case 0x66:
-    case 0x67:
-    case 0x2E:
-    case 0x3E:
-    case 0x26:
-    case 0x64:
-    case 0x65:
-    case 0x36:
-    case 0xF0:
-    case 0xF3:
-    case 0xF2:
-    case 0x40 ... 0x4f:
-        return 1;
-    }
-    return 0;
-}
-
-static unsigned long svm_rip2pointer(struct vcpu *v, unsigned long *limit)
-{
-    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned long p = vmcb->cs.base + vmcb->rip;
-
-    if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) )
-    {
-        *limit = vmcb->cs.limit;
-        return (u32)p; /* mask to 32 bits */
-    }
-    *limit = ~0UL;
-    return p;
-}
-
 static unsigned long svm_nextrip_insn_length(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
     return vmcb->nextrip - vmcb->rip;
 }
 
-/* First byte: Length. Following bytes: Opcode bytes. */
-#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
-MAKE_INSTR(INVD,   2, 0x0f, 0x08);
-MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
-MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
-MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
-MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
-MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
-MAKE_INSTR(HLT,    1, 0xf4);
-MAKE_INSTR(INT3,   1, 0xcc);
-MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
-MAKE_INSTR(PAUSE,  1, 0x90);
-MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
-MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
-MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
-MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
-MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
-MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
-MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
-
-static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
-{
-    [INSTR_INVD]   = OPCODE_INVD,
-    [INSTR_WBINVD] = OPCODE_WBINVD,
-    [INSTR_CPUID]  = OPCODE_CPUID,
-    [INSTR_RDMSR]  = OPCODE_RDMSR,
-    [INSTR_WRMSR]  = OPCODE_WRMSR,
-    [INSTR_VMCALL] = OPCODE_VMCALL,
-    [INSTR_HLT]    = OPCODE_HLT,
-    [INSTR_INT3]   = OPCODE_INT3,
-    [INSTR_RDTSC]  = OPCODE_RDTSC,
-    [INSTR_PAUSE]  = OPCODE_PAUSE,
-    [INSTR_XSETBV] = OPCODE_XSETBV,
-    [INSTR_VMRUN]  = OPCODE_VMRUN,
-    [INSTR_VMLOAD] = OPCODE_VMLOAD,
-    [INSTR_VMSAVE] = OPCODE_VMSAVE,
-    [INSTR_STGI]   = OPCODE_STGI,
-    [INSTR_CLGI]   = OPCODE_CLGI,
-    [INSTR_INVLPGA] = OPCODE_INVLPGA,
+static const struct {
+    unsigned int opcode;
+    struct {
+        unsigned int rm:3;
+        unsigned int reg:3;
+        unsigned int mod:2;
+#define MODRM(mod, reg, rm) { rm, reg, mod }
+    } modrm;
+} const opc_tab[INSTR_MAX_COUNT] = {
+    [INSTR_PAUSE]  = { X86EMUL_OPC_F3(0, 0x90) },
+    [INSTR_INT3]   = { X86EMUL_OPC(   0, 0xcc) },
+    [INSTR_HLT]    = { X86EMUL_OPC(   0, 0xf4) },
+    [INSTR_XSETBV] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
+    [INSTR_VMRUN]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
+    [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
+    [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
+    [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
+    [INSTR_STGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
+    [INSTR_CLGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
+    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
+    [INSTR_INVD]   = { X86EMUL_OPC(0x0f, 0x08) },
+    [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) },
+    [INSTR_WRMSR]  = { X86EMUL_OPC(0x0f, 0x30) },
+    [INSTR_RDTSC]  = { X86EMUL_OPC(0x0f, 0x31) },
+    [INSTR_RDMSR]  = { X86EMUL_OPC(0x0f, 0x32) },
+    [INSTR_CPUID]  = { X86EMUL_OPC(0x0f, 0xa2) },
 };
 
-static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
-                    unsigned long addr, unsigned int len)
-{
-    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
-
-    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
-    {
-    case HVMCOPY_okay:
-        break;
-    case HVMCOPY_bad_gva_to_gfn:
-        /* OK just to give up; we'll have injected #PF already */
-        return 0;
-    default:
-        /* Not OK: fetches from non-RAM pages are not supportable. */
-        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
-                 vmcb->rip, addr);
-        hvm_inject_hw_exception(TRAP_gp_fault, 0);
-        return 0;
-    }
-    return 1;
-}
-
 int __get_instruction_length_from_list(struct vcpu *v,
         const enum instruction_index *list, unsigned int list_count)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int i, j, inst_len = 0;
-    enum instruction_index instr = 0;
-    u8 buf[MAX_INST_LEN];
-    const u8 *opcode = NULL;
-    unsigned long fetch_addr, fetch_limit;
-    unsigned int fetch_len, max_len;
+    struct hvm_emulate_ctxt ctxt;
+    struct x86_emulate_state *state;
+    unsigned int inst_len, j, modrm_rm, modrm_reg;
+    int modrm_mod;
 
+#ifdef NDEBUG
     if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
         return inst_len;
 
     if ( vmcb->exitcode == VMEXIT_IOIO )
         return vmcb->exitinfo2 - vmcb->rip;
+#endif
 
-    /* Fetch up to the next page break; we'll fetch from the next page
-     * later if we have to. */
-    fetch_addr = svm_rip2pointer(v, &fetch_limit);
-    if ( vmcb->rip > fetch_limit )
-        return 0;
-    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
-    fetch_len = min_t(unsigned int, max_len,
-                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
-    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
+    ASSERT(v == current);
+    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+    hvm_emulate_init(&ctxt, NULL, 0);
+    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
+    if ( IS_ERR_OR_NULL(state) )
         return 0;
 
-    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
-    {
-        inst_len++;
-        if ( inst_len >= fetch_len )
-        {
-            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                        max_len - fetch_len) )
-                return 0;
-            fetch_len = max_len;
-        }
+    inst_len = x86_insn_length(state, &ctxt.ctxt);
+    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
+    x86_emulate_free_state(state);
+#ifndef NDEBUG
+    if ( vmcb->exitcode == VMEXIT_IOIO )
+        j = vmcb->exitinfo2 - vmcb->rip;
+    else
+        j = svm_nextrip_insn_length(v);
+    if ( j && j != inst_len )
+    {
+        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
+                ctxt.ctxt.opcode, inst_len, j);
+        return j;
     }
+#endif
 
     for ( j = 0; j < list_count; j++ )
     {
-        instr = list[j];
-        opcode = opc_bytes[instr];
+        enum instruction_index instr = list[j];
 
-        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
+        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));
+        if ( opc_tab[instr].opcode == ctxt.ctxt.opcode )
         {
-            if ( (inst_len + i) >= fetch_len ) 
-            {
-                if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                            max_len - fetch_len) )
-                    return 0;
-                fetch_len = max_len;
-            }
+            if ( !opc_tab[instr].modrm.mod )
+                return inst_len;
 
-            if ( buf[inst_len+i] != opcode[i+1] )
-                goto mismatch;
+            if ( modrm_mod == opc_tab[instr].modrm.mod &&
+                 (modrm_rm & 7) == opc_tab[instr].modrm.rm &&
+                 (modrm_reg & 7) == opc_tab[instr].modrm.reg )
+                return inst_len;
         }
-        goto done;
-    mismatch: ;
     }
 
     gdprintk(XENLOG_WARNING,
-             "%s: Mismatch between expected and actual instruction bytes: "
+             "%s: Mismatch between expected and actual instruction: "
              "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
     hvm_inject_hw_exception(TRAP_gp_fault, 0);
     return 0;
-
- done:
-    inst_len += opcode[0];
-    ASSERT(inst_len <= max_len);
-    return inst_len;
 }
 
 /*
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -382,7 +382,7 @@ struct operand {
     } mem;
 };
 #ifdef __x86_64__
-#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
+#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
 #else
 #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
@@ -1631,6 +1631,10 @@ struct x86_emulate_state {
 
     unsigned long eip;
     struct cpu_user_regs *regs;
+
+#ifndef NDEBUG
+    void *caller;
+#endif
 };
 
 /* Helper definitions. */
@@ -1658,6 +1662,11 @@ x86_decode_base(
 
     switch ( ctxt->opcode )
     {
+    case 0x90: /* nop / pause */
+        if ( repe_prefix() )
+            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
+        break;
+
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
         generate_exception_if(mode_64bit(), EXC_UD, -1);
@@ -2852,8 +2861,9 @@ x86_emulate(
         break;
 
     case 0x90: /* nop / xchg %%r8,%%rax */
+    case X86EMUL_OPC_F3(0, 0x90): /* pause / xchg %%r8,%%rax */
         if ( !(rex_prefix & 1) )
-            break; /* nop */
+            break; /* nop / pause */
         /* fall through */
 
     case 0x91 ... 0x97: /* xchg reg,%%rax */
@@ -5200,3 +5210,89 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
+
+#ifdef __XEN__
+
+#include <xen/err.h>
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt))
+{
+    static DEFINE_PER_CPU(struct x86_emulate_state, state);
+    struct x86_emulate_state *state = &this_cpu(state);
+    const struct x86_emulate_ops ops = {
+        .insn_fetch = insn_fetch,
+        .read       = x86emul_unhandleable_rw,
+        .write      = REG_POISON,
+        .cmpxchg    = REG_POISON,
+    };
+    int rc = x86_decode(state, ctxt, &ops);
+
+    if ( unlikely(rc != X86EMUL_OKAY) )
+        return ERR_PTR(-rc);
+
+#ifndef NDEBUG
+    /*
+     * While we avoid memory allocation (by use of per-CPU data) above,
+     * nevertheless make sure callers properly release the state structure
+     * for forward compatibility.
+     */
+    if ( state->caller )
+    {
+        printk(XENLOG_ERR "Unreleased emulation state acquired by %ps\n",
+               state->caller);
+        dump_execution_state();
+    }
+    state->caller = __builtin_return_address(0);
+#endif
+
+    return state;
+}
+
+static inline void check_state(const struct x86_emulate_state *state)
+{
+#ifndef NDEBUG
+    ASSERT(state->caller);
+#endif
+}
+
+#ifndef NDEBUG
+void x86_emulate_free_state(struct x86_emulate_state *state)
+{
+    check_state(state);
+    state->caller = NULL;
+}
+#endif
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg)
+{
+    check_state(state);
+
+    if ( !(state->desc & ModRM) )
+        return -EINVAL;
+
+    if ( rm )
+        *rm = state->modrm_rm;
+    if ( reg )
+        *reg = state->modrm_reg;
+
+    return state->modrm_mod;
+}
+
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt)
+{
+    check_state(state);
+
+    return state->eip - ctxt->regs->eip;
+}
+
+#endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -523,4 +523,29 @@ x86emul_unhandleable_rw(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
 
+#ifdef __XEN__
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt));
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg);
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt);
+
+#ifdef NDEBUG
+static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+#else
+void x86_emulate_free_state(struct x86_emulate_state *state);
+#endif
+
+#endif
+
 #endif /* __X86_EMULATE_H__ */
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -53,6 +53,10 @@ void hvm_mem_access_emulate_one(enum emu
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes);
 void hvm_emulate_writeback(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 struct segment_register *hvmemul_get_seg_reg(
@@ -60,6 +64,11 @@ struct segment_register *hvmemul_get_seg
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
 
+int hvmemul_insn_fetch(enum x86_segment seg,
+                       unsigned long offset,
+                       void *p_data,
+                       unsigned int bytes,
+                       struct x86_emulate_ctxt *ctxt);
 int hvmemul_do_pio_buffer(uint16_t port,
                           unsigned int size,
                           uint8_t dir,



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 10/17] x86/32on64: use generic instruction decoding for call gate emulation
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (9 preceding siblings ...)
  2016-09-08 13:14 ` [PATCH 09/17] SVM: use generic instruction decoding Jan Beulich
@ 2016-09-08 13:16 ` Jan Beulich
  2016-09-08 13:17 ` [PATCH 11/17] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
                   ` (6 subsequent siblings)
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 14035 bytes --]

... instead of custom handling. Note that we can't use generic
emulation, as the emulator's far branch support is rather rudimentary
at this point in time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
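
One detail worth spelling out (illustration only, not part of the patch):
for the direct far forms (9a call far, ea jmp far) the selector is decoded
as the second immediate, which is what x86_insn_immediate(state, 1) below
retrieves. For the indirect forms (ff /3 and ff /5) the memory operand is
an m16:16 or m16:32 far pointer whose selector follows the offset, so it
sits x86_insn_opsize(state) / 8 bytes past the effective address; that is
the offset the gate_op_read() call in the traps.c hunk uses. Sketched for
the 32-bit operand size case:

    /* Layout of the m16:32 operand of an indirect far call (ff /3). */
    #include <stdint.h>

    struct farptr32 {
        uint32_t offset;   /* at ea + 0                                  */
        uint16_t selector; /* at ea + 4 == ea + (x86_insn_opsize() >> 3) */
    };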

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
 #include <xen/init.h>
 #include <xen/sched.h>
 #include <xen/lib.h>
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/mm.h>
 #include <xen/console.h>
@@ -3138,13 +3139,92 @@ static inline int check_stack_limit(unsi
             (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
 }
 
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( is_pv_32bit_vcpu(current) )
+        addr = (uint32_t)addr;
+
+    if ( !__addr_ok(addr) ||
+         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             goc->insn_fetch && cpu_has_nx
+                             ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void emulate_gate_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, opnd_sel;
-    unsigned int op_default, op_bytes, ad_default, ad_bytes;
-    unsigned long off, eip, opnd_off, base, limit;
-    int jump;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
 
     /* Check whether this fault is due to the use of a call gate. */
     if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3166,7 +3246,8 @@ static void emulate_gate_op(struct cpu_u
      * Decode instruction (and perhaps operand) to determine RPL,
      * whether this is a jump or a call, and the call return offset.
      */
-    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
          !(ar & _SEGMENT_S) ||
          !(ar & _SEGMENT_P) ||
          !(ar & _SEGMENT_CODE) )
@@ -3175,179 +3256,59 @@ static void emulate_gate_op(struct cpu_u
         return;
     }
 
-    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
-    ad_default = ad_bytes = op_default;
-    opnd_sel = opnd_off = 0;
-    jump = -1;
-    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
+    {
+        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
+            do_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
     {
-        switch ( insn_fetch(u8, base, eip, limit) )
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+            break;
+        switch ( modrm_345 & 7 )
         {
-        case 0x66: /* operand-size override */
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            opnd_sel = regs->cs;
-            ASSERT(opnd_sel);
-            continue;
-        case 0x3e: /* DS override */
-            opnd_sel = read_sreg(ds);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x26: /* ES override */
-            opnd_sel = read_sreg(es);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x64: /* FS override */
-            opnd_sel = read_sreg(fs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x65: /* GS override */
-            opnd_sel = read_sreg(gs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x36: /* SS override */
-            opnd_sel = regs->ss;
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0xea:
+            enum x86_segment seg;
+
+        case 5:
             ++jump;
-            /* FALLTHROUGH */
-        case 0x9a:
+            /* fall through */
+        case 3:
             ++jump;
-            opnd_sel = regs->cs;
-            opnd_off = eip;
-            ad_bytes = ad_default;
-            eip += op_bytes + 2;
-            break;
-        case 0xff:
-            {
-                unsigned int modrm;
-
-                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
-                {
-                case 0x28: case 0x68: case 0xa8:
-                    ++jump;
-                    /* FALLTHROUGH */
-                case 0x18: case 0x58: case 0x98:
-                    ++jump;
-                    if ( ad_bytes != 2 )
-                    {
-                        if ( (modrm & 7) == 4 )
-                        {
-                            unsigned int sib;
-                            sib = insn_fetch(u8, base, eip, limit);
-
-                            modrm = (modrm & ~7) | (sib & 7);
-                            if ( ((sib >>= 3) & 7) != 4 )
-                                opnd_off = *(unsigned long *)
-                                    decode_register(sib & 7, regs, 0);
-                            opnd_off <<= sib >> 3;
-                        }
-                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
-                            opnd_off += *(unsigned long *)
-                                decode_register(modrm & 7, regs, 0);
-                        else
-                            modrm |= 0x87;
-                        if ( !opnd_sel )
-                        {
-                            switch ( modrm & 7 )
-                            {
-                            default:
-                                opnd_sel = read_sreg(ds);
-                                break;
-                            case 4: case 5:
-                                opnd_sel = regs->ss;
-                                break;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 1: case 7:
-                            opnd_off = regs->ebx;
-                            break;
-                        case 6:
-                            if ( !(modrm & 0xc0) )
-                                modrm |= 0x80;
-                            else
-                        case 2: case 3:
-                            {
-                                opnd_off = regs->ebp;
-                                if ( !opnd_sel )
-                                    opnd_sel = regs->ss;
-                            }
-                            break;
-                        }
-                        if ( !opnd_sel )
-                            opnd_sel = read_sreg(ds);
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 2: case 4:
-                            opnd_off += regs->esi;
-                            break;
-                        case 1: case 3: case 5:
-                            opnd_off += regs->edi;
-                            break;
-                        }
-                    }
-                    switch ( modrm & 0xc0 )
-                    {
-                    case 0x40:
-                        opnd_off += insn_fetch(s8, base, eip, limit);
-                        break;
-                    case 0x80:
-                        if ( ad_bytes > 2 )
-                            opnd_off += insn_fetch(s32, base, eip, limit);
-                        else
-                            opnd_off += insn_fetch(s16, base, eip, limit);
-                        break;
-                    }
-                    if ( ad_bytes == 4 )
-                        opnd_off = (unsigned int)opnd_off;
-                    else if ( ad_bytes == 2 )
-                        opnd_off = (unsigned short)opnd_off;
-                    break;
-                }
-            }
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
             break;
         }
         break;
     }
 
-    if ( jump < 0 )
-    {
- fail:
-        do_guest_trap(TRAP_gp_fault, regs);
- skip:
-        return;
-    }
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
 
-    if ( (opnd_sel != regs->cs &&
-          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
-    {
-        do_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
+    if ( rc == X86EMUL_EXCEPTION )
+       return;
 
-    opnd_off += op_bytes;
-#define ad_default ad_bytes
-    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
-    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
     {
         do_guest_trap(TRAP_gp_fault, regs);
         return;
@@ -3488,7 +3449,7 @@ static void emulate_gate_op(struct cpu_u
             }
         }
         push(regs->cs);
-        push(eip);
+        push(regs->eip + insn_len);
 #undef push
         regs->esp = esp;
         regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5269,6 +5269,14 @@ void x86_emulate_free_state(struct x86_e
 }
 #endif
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+    check_state(state);
+
+    return state->op_bytes << 3;
+}
+
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg)
@@ -5286,6 +5294,33 @@ x86_insn_modrm(const struct x86_emulate_
     return state->modrm_mod;
 }
 
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg)
+{
+    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+    check_state(state);
+
+    return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+    check_state(state);
+
+    switch ( nr )
+    {
+    case 0:
+        return state->imm1;
+    case 1:
+        return state->imm2;
+    }
+
+    return 0;
+}
+
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -533,9 +533,17 @@ x86_decode_insn(
         void *p_data, unsigned int bytes,
         struct x86_emulate_ctxt *ctxt));
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+                   unsigned int nr);
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt);



[-- Attachment #2: x86-32on64-gate-op-generic-decode.patch --]
[-- Type: text/plain, Size: 14103 bytes --]

x86/32on64: use generic instruction decoding for call gate emulation

... instead of custom handling. Note that we can't use generic
emulation, as the emulator's far branch support is rather rudimentary
at this point in time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
 #include <xen/init.h>
 #include <xen/sched.h>
 #include <xen/lib.h>
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/mm.h>
 #include <xen/console.h>
@@ -3138,13 +3139,92 @@ static inline int check_stack_limit(unsi
             (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
 }
 
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( is_pv_32bit_vcpu(current) )
+        addr = (uint32_t)addr;
+
+    if ( !__addr_ok(addr) ||
+         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             goc->insn_fetch && cpu_has_nx
+                             ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void emulate_gate_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, opnd_sel;
-    unsigned int op_default, op_bytes, ad_default, ad_bytes;
-    unsigned long off, eip, opnd_off, base, limit;
-    int jump;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
 
     /* Check whether this fault is due to the use of a call gate. */
     if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3166,7 +3246,8 @@ static void emulate_gate_op(struct cpu_u
      * Decode instruction (and perhaps operand) to determine RPL,
      * whether this is a jump or a call, and the call return offset.
      */
-    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
          !(ar & _SEGMENT_S) ||
          !(ar & _SEGMENT_P) ||
          !(ar & _SEGMENT_CODE) )
@@ -3175,179 +3256,59 @@ static void emulate_gate_op(struct cpu_u
         return;
     }
 
-    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
-    ad_default = ad_bytes = op_default;
-    opnd_sel = opnd_off = 0;
-    jump = -1;
-    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
+    {
+        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
+            do_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
     {
-        switch ( insn_fetch(u8, base, eip, limit) )
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+            break;
+        switch ( modrm_345 & 7 )
         {
-        case 0x66: /* operand-size override */
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            opnd_sel = regs->cs;
-            ASSERT(opnd_sel);
-            continue;
-        case 0x3e: /* DS override */
-            opnd_sel = read_sreg(ds);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x26: /* ES override */
-            opnd_sel = read_sreg(es);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x64: /* FS override */
-            opnd_sel = read_sreg(fs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x65: /* GS override */
-            opnd_sel = read_sreg(gs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x36: /* SS override */
-            opnd_sel = regs->ss;
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0xea:
+            enum x86_segment seg;
+
+        case 5:
             ++jump;
-            /* FALLTHROUGH */
-        case 0x9a:
+            /* fall through */
+        case 3:
             ++jump;
-            opnd_sel = regs->cs;
-            opnd_off = eip;
-            ad_bytes = ad_default;
-            eip += op_bytes + 2;
-            break;
-        case 0xff:
-            {
-                unsigned int modrm;
-
-                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
-                {
-                case 0x28: case 0x68: case 0xa8:
-                    ++jump;
-                    /* FALLTHROUGH */
-                case 0x18: case 0x58: case 0x98:
-                    ++jump;
-                    if ( ad_bytes != 2 )
-                    {
-                        if ( (modrm & 7) == 4 )
-                        {
-                            unsigned int sib;
-                            sib = insn_fetch(u8, base, eip, limit);
-
-                            modrm = (modrm & ~7) | (sib & 7);
-                            if ( ((sib >>= 3) & 7) != 4 )
-                                opnd_off = *(unsigned long *)
-                                    decode_register(sib & 7, regs, 0);
-                            opnd_off <<= sib >> 3;
-                        }
-                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
-                            opnd_off += *(unsigned long *)
-                                decode_register(modrm & 7, regs, 0);
-                        else
-                            modrm |= 0x87;
-                        if ( !opnd_sel )
-                        {
-                            switch ( modrm & 7 )
-                            {
-                            default:
-                                opnd_sel = read_sreg(ds);
-                                break;
-                            case 4: case 5:
-                                opnd_sel = regs->ss;
-                                break;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 1: case 7:
-                            opnd_off = regs->ebx;
-                            break;
-                        case 6:
-                            if ( !(modrm & 0xc0) )
-                                modrm |= 0x80;
-                            else
-                        case 2: case 3:
-                            {
-                                opnd_off = regs->ebp;
-                                if ( !opnd_sel )
-                                    opnd_sel = regs->ss;
-                            }
-                            break;
-                        }
-                        if ( !opnd_sel )
-                            opnd_sel = read_sreg(ds);
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 2: case 4:
-                            opnd_off += regs->esi;
-                            break;
-                        case 1: case 3: case 5:
-                            opnd_off += regs->edi;
-                            break;
-                        }
-                    }
-                    switch ( modrm & 0xc0 )
-                    {
-                    case 0x40:
-                        opnd_off += insn_fetch(s8, base, eip, limit);
-                        break;
-                    case 0x80:
-                        if ( ad_bytes > 2 )
-                            opnd_off += insn_fetch(s32, base, eip, limit);
-                        else
-                            opnd_off += insn_fetch(s16, base, eip, limit);
-                        break;
-                    }
-                    if ( ad_bytes == 4 )
-                        opnd_off = (unsigned int)opnd_off;
-                    else if ( ad_bytes == 2 )
-                        opnd_off = (unsigned short)opnd_off;
-                    break;
-                }
-            }
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
             break;
         }
         break;
     }
 
-    if ( jump < 0 )
-    {
- fail:
-        do_guest_trap(TRAP_gp_fault, regs);
- skip:
-        return;
-    }
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
 
-    if ( (opnd_sel != regs->cs &&
-          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
-    {
-        do_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
+    if ( rc == X86EMUL_EXCEPTION )
+       return;
 
-    opnd_off += op_bytes;
-#define ad_default ad_bytes
-    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
-    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
     {
         do_guest_trap(TRAP_gp_fault, regs);
         return;
@@ -3488,7 +3449,7 @@ static void emulate_gate_op(struct cpu_u
             }
         }
         push(regs->cs);
-        push(eip);
+        push(regs->eip + insn_len);
 #undef push
         regs->esp = esp;
         regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5269,6 +5269,14 @@ void x86_emulate_free_state(struct x86_e
 }
 #endif
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+    check_state(state);
+
+    return state->op_bytes << 3;
+}
+
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg)
@@ -5286,6 +5294,33 @@ x86_insn_modrm(const struct x86_emulate_
     return state->modrm_mod;
 }
 
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg)
+{
+    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+    check_state(state);
+
+    return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+    check_state(state);
+
+    switch ( nr )
+    {
+    case 0:
+        return state->imm1;
+    case 1:
+        return state->imm2;
+    }
+
+    return 0;
+}
+
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -533,9 +533,17 @@ x86_decode_insn(
         void *p_data, unsigned int bytes,
         struct x86_emulate_ctxt *ctxt));
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+                   unsigned int nr);
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt);
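
As an illustration only (not part of the patch): a minimal sketch of how the
decode-only interface and the accessors declared above are driven, mirroring
the way emulate_gate_op() uses them.  The fetch callback body and the
surrounding scaffolding are placeholders invented for this sketch; only the
x86_decode_insn() / x86_insn_*() / x86_emulate_free_state() calls are taken
from the hunks above.

    static int example_fetch(enum x86_segment seg, unsigned long offset,
                             void *p_data, unsigned int bytes,
                             struct x86_emulate_ctxt *ctxt)
    {
        /* Placeholder: a real caller copies guest bytes here. */
        return X86EMUL_UNHANDLEABLE;
    }

    static void decode_sketch(struct x86_emulate_ctxt *ctxt,
                              struct cpu_user_regs *regs)
    {
        struct x86_emulate_state *state;
        unsigned int rm, reg;

        ctxt->regs = regs;
        ctxt->addr_size = 32;                       /* as set up by the caller */
        state = x86_decode_insn(ctxt, example_fetch);
        if ( IS_ERR_OR_NULL(state) )
            return;                                 /* decode failed */

        if ( ctxt->opcode == 0xff &&                /* grp5 */
             x86_insn_modrm(state, &rm, &reg) < 3 ) /* mod != 3: memory operand */
        {
            enum x86_segment seg;
            unsigned long ea = x86_insn_operand_ea(state, &seg);

            /* E.g. read a far pointer's selector at seg:ea plus
               x86_insn_opsize(state) / 8 bytes. */
        }

        regs->eip += x86_insn_length(state, ctxt);  /* step past the insn */
        x86_emulate_free_state(state);
    }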

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 11/17] x86/PV: split out dealing with CRn from privileged instruction handling
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (10 preceding siblings ...)
  2016-09-08 13:16 ` [PATCH 10/17] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-09-08 13:17 ` Jan Beulich
  2016-09-08 13:17 ` [PATCH 12/17] x86/PV: split out dealing with DRn " Jan Beulich
                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 7040 bytes --]

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
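
A note on where this is headed (illustration, not part of the patch): the so
far unused ctxt parameter gives both helpers the shape of x86_emulate_ops
hooks.  Assuming the usual read_cr/write_cr hook names in struct
x86_emulate_ops, the eventual wiring into the generic emulator can then be
as simple as:

    static const struct x86_emulate_ops priv_op_ops = {
        /* ... other hooks ... */
        .read_cr  = priv_op_read_cr,
        .write_cr = priv_op_write_cr,
    };

Until that switch happens, the interim callers below pass NULL for ctxt,
which is fine since neither helper dereferences it.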

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2242,6 +2242,107 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Read CR0 */
+        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+        return X86EMUL_OKAY;
+
+    case 2: /* Read CR2 */
+    case 4: /* Read CR4 */
+        *val = curr->arch.pv_vcpu.ctrlreg[reg];
+        return X86EMUL_OKAY;
+
+    case 3: /* Read CR3 */
+    {
+        const struct domain *currd = curr->domain;
+        unsigned long mfn;
+
+        if ( !is_pv_32bit_domain(currd) )
+        {
+            mfn = pagetable_get_pfn(curr->arch.guest_table);
+            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        else
+        {
+            l4_pgentry_t *pl4e =
+                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+            mfn = l4e_get_pfn(*pl4e);
+            unmap_domain_page(pl4e);
+            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        /* PTs should not be shared */
+        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+        return X86EMUL_OKAY;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Write CR0 */
+        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+        {
+            gdprintk(XENLOG_WARNING,
+                    "Attempt to change unmodifiable CR0 flags\n");
+            break;
+        }
+        do_fpu_taskswitch(!!(val & X86_CR0_TS));
+        return X86EMUL_OKAY;
+
+    case 2: /* Write CR2 */
+        curr->arch.pv_vcpu.ctrlreg[2] = val;
+        arch_set_cr2(curr, val);
+        return X86EMUL_OKAY;
+
+    case 3: /* Write CR3 */
+    {
+        struct domain *currd = curr->domain;
+        unsigned long gfn;
+        struct page_info *page;
+        int rc;
+
+        gfn = !is_pv_32bit_domain(currd)
+              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+        if ( !page )
+            break;
+        rc = new_guest_cr3(page_to_mfn(page));
+        put_page(page);
+
+        switch ( rc )
+        {
+        case 0:
+            return X86EMUL_OKAY;
+        case -ERESTART: /* retry after preemption */
+            return X86EMUL_RETRY;
+        }
+        break;
+    }
+
+    case 4: /* Write CR4 */
+        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+        write_cr4(pv_guest_cr4_to_real_cr4(curr));
+        ctxt_switch_levelling(curr);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2654,48 +2755,9 @@ static int emulate_privileged_op(struct
             goto fail;
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
-        {
-        case 0: /* Read CR0 */
-            *reg = (read_cr0() & ~X86_CR0_TS) |
-                v->arch.pv_vcpu.ctrlreg[0];
-            break;
-
-        case 2: /* Read CR2 */
-            *reg = v->arch.pv_vcpu.ctrlreg[2];
-            break;
-            
-        case 3: /* Read CR3 */
-        {
-            unsigned long mfn;
-            
-            if ( !is_pv_32bit_domain(currd) )
-            {
-                mfn = pagetable_get_pfn(v->arch.guest_table);
-                *reg = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            else
-            {
-                l4_pgentry_t *pl4e =
-                    map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
-
-                mfn = l4e_get_pfn(*pl4e);
-                unmap_domain_page(pl4e);
-                *reg = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            /* PTs should not be shared */
-            BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
-        }
-        break;
-
-        case 4: /* Read CR4 */
-            *reg = v->arch.pv_vcpu.ctrlreg[4];
-            break;
-
-        default:
+        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
+                             NULL) != X86EMUL_OKAY )
             goto fail;
-        }
         break;
 
     case 0x21: /* MOV DR?,<reg> */ {
@@ -2719,56 +2781,12 @@ static int emulate_privileged_op(struct
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
         reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
+        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
         {
-        case 0: /* Write CR0 */
-            if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
-            {
-                gdprintk(XENLOG_WARNING,
-                        "Attempt to change unmodifiable CR0 flags.\n");
-                goto fail;
-            }
-            (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
-            break;
-
-        case 2: /* Write CR2 */
-            v->arch.pv_vcpu.ctrlreg[2] = *reg;
-            arch_set_cr2(v, *reg);
-            break;
-
-        case 3: {/* Write CR3 */
-            unsigned long gfn;
-            struct page_info *page;
-
-            gfn = !is_pv_32bit_domain(currd)
-                ? xen_cr3_to_pfn(*reg) : compat_cr3_to_pfn(*reg);
-            page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
-            if ( page )
-            {
-                rc = new_guest_cr3(page_to_mfn(page));
-                put_page(page);
-            }
-            else
-                rc = -EINVAL;
-
-            switch ( rc )
-            {
-            case 0:
-                break;
-            case -ERESTART: /* retry after preemption */
-                goto skip;
-            default:      /* not okay */
-                goto fail;
-            }
+        case X86EMUL_OKAY:
             break;
-        }
-
-        case 4: /* Write CR4 */
-            v->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg);
-            write_cr4(pv_guest_cr4_to_real_cr4(v));
-            ctxt_switch_levelling(v);
-            break;
-
+        case X86EMUL_RETRY: /* retry after preemption */
+            goto skip;
         default:
             goto fail;
         }



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 12/17] x86/PV: split out dealing with DRn from privileged instruction handling
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (11 preceding siblings ...)
  2016-09-08 13:17 ` [PATCH 11/17] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
@ 2016-09-08 13:17 ` Jan Beulich
  2016-09-08 13:18 ` [PATCH 13/17] x86/PV: split out dealing with MSRs " Jan Beulich
                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 2289 bytes --]

This is in preparation for using the generic emulator here.

Some care is needed temporarily so as not to alter guest register state
unduly: the local variable "res" can only go away once this code has been
fully switched over to using x86_emulate().

Also switch to IS_ERR_VALUE() instead of (incorrectly) open coding it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
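
For reference (a sketch, not part of the patch): the check being replaced is
"(res = do_get_debugreg(modrm_reg)) > (unsigned long)-256", which only
treats -255..-1 as errors.  Assuming xen/err.h carries the conventional
definitions (the unlikely() annotation is left out here), the difference is:

    #define MAX_ERRNO 4095
    #define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)

    static bool old_check(unsigned long res)
    {
        return res > (unsigned long)-256;   /* misses -4095..-256 */
    }

    static bool new_check(unsigned long res)
    {
        return IS_ERR_VALUE(res);           /* full -errno range */
    }

For a hypothetical -errno value such as -1024, old_check() reads it as a
valid register value while new_check() correctly flags it as an error.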

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2343,6 +2343,26 @@ static int priv_op_write_cr(unsigned int
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    unsigned long res = do_get_debugreg(reg);
+
+    if ( IS_ERR_VALUE(res) )
+        return X86EMUL_UNHANDLEABLE;
+
+    *val = res;
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    return do_set_debugreg(reg, val) == 0
+           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2761,16 +2781,14 @@ static int emulate_privileged_op(struct
         break;
 
     case 0x21: /* MOV DR?,<reg> */ {
-        unsigned long res;
         opcode = insn_fetch(u8, code_base, eip, code_limit);
         if ( opcode < 0xc0 )
             goto fail;
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
+        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
+                             NULL) != X86EMUL_OKAY )
             goto fail;
-        *reg = res;
         break;
     }
 
@@ -2799,7 +2817,7 @@ static int emulate_privileged_op(struct
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
         reg = decode_register(modrm_rm, regs, 0);
-        if ( do_set_debugreg(modrm_reg, *reg) != 0 )
+        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
             goto fail;
         break;
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 13/17] x86/PV: split out dealing with MSRs from privileged instruction handling
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (12 preceding siblings ...)
  2016-09-08 13:17 ` [PATCH 12/17] x86/PV: split out dealing with DRn " Jan Beulich
@ 2016-09-08 13:18 ` Jan Beulich
  2016-09-08 13:18 ` [PATCH 14/17] x86emul: support XSETBV Jan Beulich
                   ` (3 subsequent siblings)
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:18 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 25181 bytes --]

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
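
Worth noting (not part of the patch text): the helpers deliberately take a
full 64-bit uint64_t value, so the edx:eax split stays with the caller.  A
sketch of that caller-side glue, using the same names as the hunks below:

    uint64_t val;

    /* WRMSR: assemble edx:eax into one value for the helper. */
    if ( priv_op_write_msr(regs->_ecx,
                           ((uint64_t)regs->_edx << 32) | regs->_eax,
                           NULL) != X86EMUL_OKAY )
        goto fail;

    /* RDMSR: split the helper's result back into edx:eax. */
    if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
        goto fail;
    regs->eax = (uint32_t)val;
    regs->edx = (uint32_t)(val >> 32);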

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2373,6 +2373,332 @@ static inline uint64_t guest_misc_enable
     return val;
 }
 
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdgsbase()
+                                : curr->arch.pv_vcpu.gs_base_kernel;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = curr->arch.pv_vcpu.gs_base_user;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( unlikely(is_cpufreq_controller(currd)) )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_UCODE_REV:
+        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+                break;
+            sync_core();
+        }
+        goto normal;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        *val = guest_misc_enable(*val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[0];
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_PERF_CAPABILITIES:
+        /* No extra capabilities are supported. */
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+            /* fall through */
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                /* Don't leak PMU MSRs to unprivileged domains. */
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    *val = 0;
+                else if ( vpmu_do_rdmsr(reg, val) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( rdmsr_hypervisor_regs(reg, val) )
+            return X86EMUL_OKAY;
+
+        rc = vmce_rdmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+        /* fall through */
+    case MSR_EFER:
+    normal:
+        /* Everyone can read the MSR space. */
+        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+#include "x86_64/mmconfig.h"
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        uint64_t temp;
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrfsbase(val);
+        curr->arch.pv_vcpu.fs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrgsbase(val);
+        curr->arch.pv_vcpu.gs_base_kernel = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) ||
+             wrmsr_safe(MSR_SHADOW_GS_BASE, val) )
+            break;
+        curr->arch.pv_vcpu.gs_base_user = val;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+    case MSR_K8_HWCR:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_NB_CFG:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_FAM10H_MMIO_CONF_BASE:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+            break;
+        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+             temp != val :
+             ((temp ^ val) &
+              ~(FAM10H_MMIO_CONF_ENABLE |
+                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_UCODE_REV:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val != guest_misc_enable(temp) )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MPERF:
+    case MSR_IA32_APERF:
+        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_PERF_CTL:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_THERM_CONTROL:
+    case MSR_IA32_ENERGY_PERF_BIAS:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[0] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(reg, val);
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    return X86EMUL_OKAY;
+
+                if ( vpmu_do_wrmsr(reg, val, 0) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+            return X86EMUL_OKAY;
+
+        rc = vmce_wrmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+
+        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+    invalid:
+            gdprintk(XENLOG_WARNING,
+                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+                     reg, temp, val);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 /* Instruction fetch with error handling. */
 #define insn_fetch(type, base, eip, limit)                                  \
 ({  unsigned long _rc, _ptr = (base) + (eip);                               \
@@ -2388,14 +2714,6 @@ static inline uint64_t guest_misc_enable
     }                                                                       \
     (eip) += sizeof(_x); _x; })
 
-static int is_cpufreq_controller(struct domain *d)
-{
-    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
-            is_hardware_domain(d));
-}
-
-#include "x86_64/mmconfig.h"
-
 static int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -2420,7 +2738,6 @@ static int emulate_privileged_op(struct
     char *io_emul_stub = NULL;
     void (*io_emul)(struct cpu_user_regs *);
     uint64_t val;
-    bool_t vpmu_msr;
 
     if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
         goto fail;
@@ -2821,188 +3138,11 @@ static int emulate_privileged_op(struct
             goto fail;
         break;
 
-    case 0x30: /* WRMSR */ {
-        uint32_t eax = regs->eax;
-        uint32_t edx = regs->edx;
-        uint64_t msr_content = ((uint64_t)edx << 32) | eax;
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrfsbase(msr_content);
-            v->arch.pv_vcpu.fs_base = msr_content;
-            break;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrgsbase(msr_content);
-            v->arch.pv_vcpu.gs_base_kernel = msr_content;
-            break;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) )
-                goto fail;
-            v->arch.pv_vcpu.gs_base_user = msr_content;
-            break;
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-        case MSR_K8_HWCR:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_AMD64_NB_CFG:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) ||
-                 (eax != (uint32_t)val) ||
-                 ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_FAM10H_MMIO_CONF_BASE:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) )
-                goto fail;
-            if (
-                 (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
-                 val != msr_content :
-                 ((val ^ msr_content) &
-                  ~( FAM10H_MMIO_CONF_ENABLE |
-                    (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
-                     FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
-                    ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
-                     FAM10H_MMIO_CONF_BASE_SHIFT))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_UCODE_REV:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            if ( msr_content )
-                goto invalid;
-            break;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            if ( msr_content != val )
-                goto invalid;
-            break;
-        case MSR_IA32_MPERF:
-        case MSR_IA32_APERF:
-            if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) &&
-                ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content ) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_PERF_CTL:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_THERM_CONTROL:
-        case MSR_IA32_ENERGY_PERF_BIAS:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask[0] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask
-                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(regs->_ecx, msr_content);
-            break;
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                        break;
-
-                    if ( vpmu_do_wrmsr(regs->ecx, msr_content, 0) )
-                        goto fail;
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
-                break;
-
-            rc = vmce_wrmsr(regs->ecx, msr_content);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                break;
-
-            if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) )
-        invalid:
-                gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
-                        "0x%016"PRIx64" to 0x%016"PRIx64".\n",
-                        _p(regs->ecx), val, msr_content);
-            break;
-        }
+    case 0x30: /* WRMSR */
+        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
+                               NULL) != X86EMUL_OKAY )
+            goto fail;
         break;
-    }
 
     case 0x31: /* RDTSC */
         if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
@@ -3018,130 +3158,11 @@ static int emulate_privileged_op(struct
         break;
 
     case 0x32: /* RDMSR */
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
-            goto rdmsr_writeback;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdgsbase()
-                                   : v->arch.pv_vcpu.gs_base_kernel;
-            goto rdmsr_writeback;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = v->arch.pv_vcpu.gs_base_user;
-            goto rdmsr_writeback;
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-            {
-                regs->eax = regs->edx = 0;
-                break;
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_UCODE_REV:
-            BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
-                    goto fail;
-                sync_core();
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            goto rdmsr_writeback;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask[0];
-            regs->edx = 0;
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask
-                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
-            regs->edx = 0;
-            break;
-        case MSR_IA32_PERF_CAPABILITIES:
-            /* No extra capabilities are supported */
-            regs->eax = regs->edx = 0;
-            break;
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                    {
-                        /* Don't leak PMU MSRs to unprivileged domains */
-                        regs->eax = regs->edx = 0;
-                        break;
-                    }
-
-                    if ( vpmu_do_rdmsr(regs->ecx, &val) )
-                        goto fail;
-
-                    regs->eax = (uint32_t)val;
-                    regs->edx = (uint32_t)(val >> 32);
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
-                goto rdmsr_writeback;
-
-            rc = vmce_rdmsr(regs->ecx, &val);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                goto rdmsr_writeback;
-
-        case MSR_EFER:
- rdmsr_normal:
-            /* Everyone can read the MSR space. */
-            /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
-                        _p(regs->ecx));*/
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
+        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
+            goto fail;
  rdmsr_writeback:
-            regs->eax = (uint32_t)val;
-            regs->edx = (uint32_t)(val >> 32);
-            break;
-        }
+        regs->eax = (uint32_t)val;
+        regs->edx = (uint32_t)(val >> 32);
         break;
 
     case 0xa2: /* CPUID */
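
One construct above that is easy to misread is the PMU MSR handling in
priv_op_read_msr() / priv_op_write_msr(), where case labels sit inside the
body of an if().  As a flat illustration of the resulting control flow only
(the two range predicates below are invented for this sketch and do not
exist in the tree), the vPMU path is taken exactly when the MSR falls in the
running vendor's own PMU range:

    bool intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
    bool amd   = boot_cpu_data.x86_vendor == X86_VENDOR_AMD;
    bool intel_pmu = in_intel_pmu_msr_range(reg);   /* P6 / CORE_PERF cases */
    bool amd_pmu   = in_amd_pmu_msr_range(reg);     /* K7 / FAM15H cases */

    if ( (intel && intel_pmu) || (amd && amd_pmu) )
    {
        /* vPMU path: zeroed (read) or silently dropped (write) for
           unprivileged domains under XENPMU_MODE_ALL, otherwise handed to
           vpmu_do_rdmsr() / vpmu_do_wrmsr(). */
    }
    else
    {
        /* Default path: hypervisor registers, vMCE, then plain
           rdmsr_safe() probing. */
    }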



[-- Attachment #2: x86-PV-priv-op-split-MSR.patch --]
[-- Type: text/plain, Size: 25253 bytes --]

x86/PV: split out dealing with MSRs from privileged instruction handling

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2373,6 +2373,332 @@ static inline uint64_t guest_misc_enable
     return val;
 }
 
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdgsbase()
+                                : curr->arch.pv_vcpu.gs_base_kernel;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = curr->arch.pv_vcpu.gs_base_user;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( unlikely(is_cpufreq_controller(currd)) )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_UCODE_REV:
+        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+                break;
+            sync_core();
+        }
+        goto normal;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        *val = guest_misc_enable(*val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[0];
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_PERF_CAPABILITIES:
+        /* No extra capabilities are supported. */
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+            /* fall through */
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                /* Don't leak PMU MSRs to unprivileged domains. */
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    *val = 0;
+                else if ( vpmu_do_rdmsr(reg, val) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( rdmsr_hypervisor_regs(reg, val) )
+            return X86EMUL_OKAY;
+
+        rc = vmce_rdmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+        /* fall through */
+    case MSR_EFER:
+    normal:
+        /* Everyone can read the MSR space. */
+        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+#include "x86_64/mmconfig.h"
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        uint64_t temp;
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrfsbase(val);
+        curr->arch.pv_vcpu.fs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrgsbase(val);
+        curr->arch.pv_vcpu.gs_base_kernel = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) ||
+             wrmsr_safe(MSR_SHADOW_GS_BASE, val) )
+            break;
+        curr->arch.pv_vcpu.gs_base_user = val;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+    case MSR_K8_HWCR:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_NB_CFG:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_FAM10H_MMIO_CONF_BASE:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+            break;
+        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+             temp != val :
+             ((temp ^ val) &
+              ~(FAM10H_MMIO_CONF_ENABLE |
+                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_UCODE_REV:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val != guest_misc_enable(temp) )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MPERF:
+    case MSR_IA32_APERF:
+        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_PERF_CTL:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_THERM_CONTROL:
+    case MSR_IA32_ENERGY_PERF_BIAS:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[0] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(reg, val);
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    return X86EMUL_OKAY;
+
+                if ( vpmu_do_wrmsr(reg, val, 0) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+            return X86EMUL_OKAY;
+
+        rc = vmce_wrmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+
+        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+    invalid:
+            gdprintk(XENLOG_WARNING,
+                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+                     reg, temp, val);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 /* Instruction fetch with error handling. */
 #define insn_fetch(type, base, eip, limit)                                  \
 ({  unsigned long _rc, _ptr = (base) + (eip);                               \
@@ -2388,14 +2714,6 @@ static inline uint64_t guest_misc_enable
     }                                                                       \
     (eip) += sizeof(_x); _x; })
 
-static int is_cpufreq_controller(struct domain *d)
-{
-    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
-            is_hardware_domain(d));
-}
-
-#include "x86_64/mmconfig.h"
-
 static int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -2420,7 +2738,6 @@ static int emulate_privileged_op(struct
     char *io_emul_stub = NULL;
     void (*io_emul)(struct cpu_user_regs *);
     uint64_t val;
-    bool_t vpmu_msr;
 
     if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
         goto fail;
@@ -2821,188 +3138,11 @@ static int emulate_privileged_op(struct
             goto fail;
         break;
 
-    case 0x30: /* WRMSR */ {
-        uint32_t eax = regs->eax;
-        uint32_t edx = regs->edx;
-        uint64_t msr_content = ((uint64_t)edx << 32) | eax;
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrfsbase(msr_content);
-            v->arch.pv_vcpu.fs_base = msr_content;
-            break;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrgsbase(msr_content);
-            v->arch.pv_vcpu.gs_base_kernel = msr_content;
-            break;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) )
-                goto fail;
-            v->arch.pv_vcpu.gs_base_user = msr_content;
-            break;
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-        case MSR_K8_HWCR:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_AMD64_NB_CFG:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) ||
-                 (eax != (uint32_t)val) ||
-                 ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_FAM10H_MMIO_CONF_BASE:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) )
-                goto fail;
-            if (
-                 (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
-                 val != msr_content :
-                 ((val ^ msr_content) &
-                  ~( FAM10H_MMIO_CONF_ENABLE |
-                    (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
-                     FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
-                    ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
-                     FAM10H_MMIO_CONF_BASE_SHIFT))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_UCODE_REV:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            if ( msr_content )
-                goto invalid;
-            break;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            if ( msr_content != val )
-                goto invalid;
-            break;
-        case MSR_IA32_MPERF:
-        case MSR_IA32_APERF:
-            if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) &&
-                ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content ) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_PERF_CTL:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_THERM_CONTROL:
-        case MSR_IA32_ENERGY_PERF_BIAS:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask[0] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask
-                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(regs->_ecx, msr_content);
-            break;
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                        break;
-
-                    if ( vpmu_do_wrmsr(regs->ecx, msr_content, 0) )
-                        goto fail;
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
-                break;
-
-            rc = vmce_wrmsr(regs->ecx, msr_content);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                break;
-
-            if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) )
-        invalid:
-                gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
-                        "0x%016"PRIx64" to 0x%016"PRIx64".\n",
-                        _p(regs->ecx), val, msr_content);
-            break;
-        }
+    case 0x30: /* WRMSR */
+        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
+                               NULL) != X86EMUL_OKAY )
+            goto fail;
         break;
-    }
 
     case 0x31: /* RDTSC */
         if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
@@ -3018,130 +3158,11 @@ static int emulate_privileged_op(struct
         break;
 
     case 0x32: /* RDMSR */
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
-            goto rdmsr_writeback;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdgsbase()
-                                   : v->arch.pv_vcpu.gs_base_kernel;
-            goto rdmsr_writeback;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = v->arch.pv_vcpu.gs_base_user;
-            goto rdmsr_writeback;
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-            {
-                regs->eax = regs->edx = 0;
-                break;
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_UCODE_REV:
-            BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
-                    goto fail;
-                sync_core();
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            goto rdmsr_writeback;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask[0];
-            regs->edx = 0;
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask
-                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
-            regs->edx = 0;
-            break;
-        case MSR_IA32_PERF_CAPABILITIES:
-            /* No extra capabilities are supported */
-            regs->eax = regs->edx = 0;
-            break;
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                    {
-                        /* Don't leak PMU MSRs to unprivileged domains */
-                        regs->eax = regs->edx = 0;
-                        break;
-                    }
-
-                    if ( vpmu_do_rdmsr(regs->ecx, &val) )
-                        goto fail;
-
-                    regs->eax = (uint32_t)val;
-                    regs->edx = (uint32_t)(val >> 32);
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
-                goto rdmsr_writeback;
-
-            rc = vmce_rdmsr(regs->ecx, &val);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                goto rdmsr_writeback;
-
-        case MSR_EFER:
- rdmsr_normal:
-            /* Everyone can read the MSR space. */
-            /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
-                        _p(regs->ecx));*/
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
+        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
+            goto fail;
  rdmsr_writeback:
-            regs->eax = (uint32_t)val;
-            regs->edx = (uint32_t)(val >> 32);
-            break;
-        }
+        regs->eax = (uint32_t)val;
+        regs->edx = (uint32_t)(val >> 32);
         break;
 
     case 0xa2: /* CPUID */
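
For orientation, a minimal illustrative sketch (not code from this series)
of why the new helpers take the x86_emulate_ops-style signature, including
the for now unused ctxt parameter: the later "x86/PV: use generic emulator
for privileged instruction handling" patch in this series wires them
directly into an ops table, roughly

    static const struct x86_emulate_ops priv_op_ops = {
        /* other hooks omitted in this sketch */
        .read_msr  = priv_op_read_msr,
        .write_msr = priv_op_write_msr,
    };

which is why the interim RDMSR/WRMSR callers above simply pass NULL for
ctxt until that switch happens.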

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 14/17] x86emul: support XSETBV
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (13 preceding siblings ...)
  2016-09-08 13:18 ` [PATCH 13/17] x86/PV: split out dealing with MSRs " Jan Beulich
@ 2016-09-08 13:18 ` Jan Beulich
  2016-09-08 13:19 ` [PATCH 15/17] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
                   ` (2 subsequent siblings)
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:18 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1249 bytes --]

This is a prereq for switching PV privileged op emulation to the
generic instruction emulator. Since handle_xsetbv() is already capable
of dealing with all guest kinds, avoid introducing another hook here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4174,6 +4174,24 @@ x86_emulate(
 
         switch( modrm )
         {
+#ifdef __XEN__
+        case 0xd1: /* xsetbv */
+        {
+            unsigned long cr4;
+
+            if ( vex.pfx )
+                break;
+            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                cr4 = 0;
+            generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD, -1);
+            generate_exception_if(!mode_ring0() ||
+                                  handle_xsetbv(_regs._ecx,
+                                                _regs._eax | (_regs.rdx << 32)),
+                                  EXC_GP, 0);
+            goto no_writeback;
+        }
+#endif
+
         case 0xdf: /* invlpga */
             generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
             generate_exception_if(!mode_ring0(), EXC_GP, 0);
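
As a usage illustration (a sketch only, not code from this series): XSETBV
takes the XCR index in %ecx and the new 64-bit value in %edx:%eax, which is
what the new case reconstructs from _regs._ecx, _regs._eax and _regs.rdx. A
guest-side wrapper issuing the instruction would look roughly like

    static inline void xsetbv(uint32_t index, uint64_t value)
    {
        /* ECX selects the XCR (0 = XCR0), EDX:EAX holds the new value. */
        asm volatile ( "xsetbv"
                       :: "c" (index), "a" ((uint32_t)value),
                          "d" ((uint32_t)(value >> 32)) );
    }

    static void example_enable_xstate(void)
    {
        /* Illustrative value: x87 (bit 0) + SSE (bit 1) state in XCR0. */
        xsetbv(0, 0x3);
    }

with the emulator raising #UD when CR4.OSXSAVE is clear and #GP(0) when
executed outside ring 0 or when handle_xsetbv() rejects the value, as per
the generate_exception_if() checks above.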




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 15/17] x86emul: sort opcode 0f01 special case switch() statement
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (14 preceding siblings ...)
  2016-09-08 13:18 ` [PATCH 14/17] x86emul: support XSETBV Jan Beulich
@ 2016-09-08 13:19 ` Jan Beulich
  2016-09-08 13:20 ` [PATCH 16/17] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
  2016-09-08 13:21 ` [PATCH 17/17] x86emul: don't assume a memory operand Jan Beulich
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1876 bytes --]

Sort the special case opcode 0f01 entries numerically, insert blank
lines between each of the cases, and properly place opening braces.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4192,6 +4192,14 @@ x86_emulate(
         }
 #endif
 
+        case 0xd4: /* vmfunc */
+            generate_exception_if(lock_prefix | rep_prefix() | (vex.pfx == vex_66),
+                                  EXC_UD, -1);
+            fail_if(ops->vmfunc == NULL);
+            if ( (rc = ops->vmfunc(ctxt) != X86EMUL_OKAY) )
+                goto done;
+            goto no_writeback;
+
         case 0xdf: /* invlpga */
             generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
             generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4200,7 +4208,9 @@ x86_emulate(
                                    ctxt)) )
                 goto done;
             goto no_writeback;
-        case 0xf9: /* rdtscp */ {
+
+        case 0xf9: /* rdtscp */
+        {
             uint64_t tsc_aux;
             fail_if(ops->read_msr == NULL);
             if ( (rc = ops->read_msr(MSR_TSC_AUX, &tsc_aux, ctxt)) != 0 )
@@ -4208,14 +4218,9 @@ x86_emulate(
             _regs.ecx = (uint32_t)tsc_aux;
             goto rdtsc;
         }
-        case 0xd4: /* vmfunc */
-            generate_exception_if(lock_prefix | rep_prefix() | (vex.pfx == vex_66),
-                                  EXC_UD, -1);
-            fail_if(ops->vmfunc == NULL);
-            if ( (rc = ops->vmfunc(ctxt) != X86EMUL_OKAY) )
-                goto done;
-            goto no_writeback;
-	case 0xfc: /* clzero */ {
+
+        case 0xfc: /* clzero */
+        {
             unsigned int eax = 1, ebx = 0, dummy = 0;
             unsigned long zero = 0;
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 16/17] x86/PV: use generic emulator for privileged instruction handling
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (15 preceding siblings ...)
  2016-09-08 13:19 ` [PATCH 15/17] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
@ 2016-09-08 13:20 ` Jan Beulich
  2016-09-08 13:21 ` [PATCH 17/17] x86emul: don't assume a memory operand Jan Beulich
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:20 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 47855 bytes --]

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment). Handling it in the epilogue
code involves moving the raising of the single-step trap to after the
registers have been updated. This should probably have been that way
from the beginning, to allow the inject_hw_exception() hook to see the
updated register state (in case it cares) - it's a trap, after all.

The other small tweak to the emulator concerns single-iteration
handling of INS and OUTS: since we don't want to handle any other
memory access instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too. The read() / write() hook
pointers get checked for that purpose.

And finally, exception handling gets changed for REP INS / REP OUTS:
if the hook returns X86EMUL_EXCEPTION, register state will still get
updated if some iterations have been performed (but the rIP update will
get suppressed if not all of them got handled). While on the HVM side
the VA -> LA -> PA translation process clips the number of repetitions,
doing so would unduly complicate the PV side code being added here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
One thing to consider is that, despite avoiding the handling of memory
reads and writes (other than for INS and OUTS), the set of insns now
potentially handled by the emulator is much larger than before. A
possible solution to this would be a new hook, called between the
decode and execution stages, allowing further restrictions to be
enforced. Of course this could easily be a follow-up patch, as the one
here is quite big already.

Another thing to consider is whether to extend the X86EMUL_EXCEPTION
handling change mentioned above to other string instructions. In that
case this should probably be broken out into a prereq patch.
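
To make the *reps contract described above concrete, here is a minimal
illustrative sketch (not code from this series) of a rep_outs()-style hook
that follows it; sketch_emit_one() is a hypothetical per-iteration helper
standing in for the guest memory read plus port write:

    /* Hypothetical helper: copy one element from guest memory, write it to the port. */
    static int sketch_emit_one(enum x86_segment seg, unsigned long offset,
                               uint16_t port, unsigned int bytes_per_rep,
                               struct x86_emulate_ctxt *ctxt);

    static int sketch_rep_outs(enum x86_segment seg, unsigned long offset,
                               uint16_t port, unsigned int bytes_per_rep,
                               unsigned long *reps,
                               struct x86_emulate_ctxt *ctxt)
    {
        unsigned long goal = *reps;

        for ( *reps = 0; *reps < goal; ++*reps )
        {
            int rc = sketch_emit_one(seg, offset, port, bytes_per_rep, ctxt);

            if ( rc != X86EMUL_OKAY )
                /* e.g. X86EMUL_EXCEPTION with *reps completed iterations */
                return rc;

            if ( ctxt->regs->_eflags & X86_EFLAGS_DF )
                offset -= bytes_per_rep;
            else
                offset += bytes_per_rep;
        }

        return X86EMUL_OKAY;
    }

Returning X86EMUL_EXCEPTION with *reps > 0 lets x86_emulate() update the
GPRs for the iterations that did complete while suppressing the rIP
update, so the faulting iteration is retried once the exception has been
dealt with.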

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -20,6 +20,9 @@ typedef bool bool_t;
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
+#define likely(x)   __builtin_expect(!!(x), true)
+#define unlikely(x) __builtin_expect(!!(x), false)
+
 #define __packed __attribute__((packed))
 
 /* For generic assembly code: use macros to define operation/operand sizes. */
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         hvm_inject_page_fault(pfec, addr);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 hvm_inject_page_fault(pfec, addr & PAGE_MASK);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -568,6 +569,7 @@ static int hvmemul_virtual_to_linear(
         return X86EMUL_UNHANDLEABLE;
 
     /* This is a singleton operation: fail it with an exception. */
+    *reps = 0;
     hvmemul_ctxt->exn_pending = 1;
     hvmemul_ctxt->trap.vector =
         (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -662,16 +662,13 @@ static void do_guest_trap(unsigned int t
                 trapstr(trapnr), trapnr, regs->error_code);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1272,7 +1269,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1305,7 +1302,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -1989,6 +1986,154 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              const struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( unlikely(limit < bytes - 1) ||
+             unlikely(offset > limit - bytes + 1) )
+        {
+            do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                          ctxt->regs);
+            return false;
+        }
+
+        *addr = (uint32_t)*addr;
+    }
+    else if ( unlikely(!__addr_ok(*addr)) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return false;
+    }
+
+    return true;
+}
+
+static int priv_op_insn_fetch(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             cpu_has_nx ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access to I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default:
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2242,6 +2387,234 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        do_guest_trap(TRAP_gp_fault, ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, PFEC_write_access);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, 0);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2382,6 +2755,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2409,6 +2783,22 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior we defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2699,493 +3089,170 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
 
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
-    {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
+    return X86EMUL_OKAY;
+}
 
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
+static int priv_op_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct cpu_user_regs regs = *ctxt->regs;
+
+    regs._eax = *eax;
+    regs._ebx = *ebx;
+    regs._ecx = *ecx;
+    regs._edx = *edx;
+
+    pv_cpuid(&regs);
+
+    *eax = regs._eax;
+    *ebx = regs._ebx;
+    *ecx = regs._ecx;
+    *edx = regs._edx;
 
-        port = (u16)regs->edx;
+    return X86EMUL_OKAY;
+}
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
-                                     PFEC_write_access);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+static int priv_op_hw_exception(uint8_t vector, int32_t error_code,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    do_guest_trap(vector, ctxt->regs);
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(esi)
-                                     + op_bytes - rc, 0);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
+    if ( error_code >= 0 )
+    {
+        struct trap_bounce *tb = &current->arch.pv_vcpu.trap_bounce;
 
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    return X86EMUL_EXCEPTION;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .write               = x86emul_unhandleable_rw,
+    .cmpxchg             = x86emul_unhandleable_cx,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = priv_op_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+    .inject_hw_exception = priv_op_hw_exception,
+};
 
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-        goto done;
-    }
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
     /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
+     * Don't have x86_emulate() inject single step traps, as we want #DB
+     * also delivered for I/O break points (see below).
      */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+    if ( regs->_eflags & X86_EFLAGS_TF )
+    {
+        ctxt.bpmatch = DR_STEP;
+        regs->_eflags &= ~X86_EFLAGS_TF;
+    }
+    eflags = regs->_eflags;
 
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
+    /* Un-mirror virtualized state from EFLAGS. */
+    if ( (regs->_eflags ^ eflags) & X86_EFLAGS_IF )
     {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
+        /* The only allowed insns altering EFLAGS.IF are CLI/STI. */
+        ASSERT((ctxt.ctxt.opcode & ~1) == 0xfa);
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(curr, evtchn_upcall_mask) = (opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
     }
-
-    /* No decode of this single-byte opcode. */
-    goto fail;
-
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
-            break;
-        case 0xd1: /* XSETBV */
-        {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
-
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+    regs->_eflags |= X86_EFLAGS_IF;
+    /* Nothing we allow to be emulated can change IOPL or TF. */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IOPL | X86_EFLAGS_TF)));
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+    if ( ctxt.bpmatch & DR_STEP )
+        regs->_eflags |= X86_EFLAGS_TF;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
             {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
-
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
-
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
-
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+                uint64_t val = rdtsc();
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
-
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
-
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
-
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        pv_cpuid(regs);
-        break;
-
-    default:
-        goto fail;
+        /* fall through */
+    case X86EMUL_RETRY:
+    case X86EMUL_EXCEPTION:
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3515,7 +3583,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,7 +857,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -1075,7 +1079,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc != X86EMUL_DONE ? rc : X86EMUL_OKAY;
 
     /* Ensure that the TSS is valid and has an io-bitmap-offset field. */
     if ( !tr.attr.fields.p ||
@@ -1599,6 +1603,17 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
@@ -2263,6 +2278,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
+    bool tf = ctxt->regs->eflags & EFLG_TF;
     struct operand src = { .reg = REG_POISON };
     struct operand dst = { .reg = REG_POISON };
     enum x86_swint_type swint_type;
@@ -2718,14 +2734,10 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( ((nr_reps == 1) && (ops->write != x86emul_unhandleable_rw)) ||
+             !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2737,6 +2749,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2747,14 +2761,10 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( ((nr_reps == 1) && (ops->read != x86emul_unhandleable_rw)) ||
+             !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2768,6 +2778,8 @@ x86_emulate(
             _regs.esi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3026,6 +3038,7 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
         else if ( rc != X86EMUL_OKAY )
             goto done;
@@ -3842,7 +3855,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5195,11 +5212,6 @@ x86_emulate(
     }
 
  no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
@@ -5207,7 +5219,18 @@ x86_emulate(
     if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip    = _regs.eip;
+        ctxt->regs->eflags = _regs.eflags;
+        rc = X86EMUL_OKAY;
+    }
+
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( tf && (rc == X86EMUL_OKAY) && ops->inject_hw_exception )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -111,6 +111,13 @@ struct __packed segment_register {
 #define X86EMUL_RETRY          3
  /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */
 #define X86EMUL_CMPXCHG_FAILED 3
+ /*
+  * Operation fully done by one of the hooks:
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -522,6 +529,15 @@ x86emul_unhandleable_rw(
     void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
+/* Unhandleable cmpxchg */
+int
+x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt);
 
 #ifdef __XEN__
 



[-- Attachment #2: x86-PV-priv-op-generic-emul.patch --]
[-- Type: text/plain, Size: 47919 bytes --]

x86/PV: use generic emulator for privileged instruction handling

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment). Its handling in the epilogue
code involves delaying the raising of the single step trap until after
the registers have been updated. This should probably have been that
way from the beginning, to allow the inject_hw_exception() hook to see
updated register state (in case it cares) - it's a trap, after all.

The other small tweak to the emulator concerns single iteration
handling of INS and OUTS: Since we don't want to handle any other
memory access instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too. The read() / write() hook
pointers get checked for that purpose.

And finally handling of exceptions gets changed for REP INS / REP OUTS:
If the hook returns X86EMUL_EXCEPTION, register state will still get
updated if some iterations have been performed (but the rIP update will
get suppressed if not all of them were handled). For example, a REP INSB
with rCX=4 faulting on its third iteration leaves rCX/rDI reflecting the
two completed iterations, while rIP still points at the INSB, so the
guest simply re-executes the remainder once the fault has been handled.
While on the HVM side the VA -> LA -> PA translation process clips the
number of repetitions up front, doing the same would unduly complicate
the PV side code being added here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
One thing to be considered is that, despite avoiding the handling of
memory reads and writes (other than for INS and OUTS), the set of insns
now potentially handled by the emulator is much larger than before. A
possible solution to this would be a new hook to be called between the
decode and execution stages, allowing further restrictions to be
enforced. Of course this could easily be a follow-up patch, as the one
here is quite big already.
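
Purely as an illustration (not part of this patch; the hook name, its
signature, and the call site are all made up here), such a restriction
hook could look roughly like the below, with the PV implementation
simply white-listing the opcodes the pre-existing code used to accept:

    /* Hypothetical addition to struct x86_emulate_ops: */
    int (*validate)(const struct x86_emulate_state *state,
                    struct x86_emulate_ctxt *ctxt);

    /*
     * Hypothetical PV implementation, to be invoked by x86_emulate()
     * after decoding but before any execution side effects occur.
     */
    static int priv_op_validate(const struct x86_emulate_state *state,
                                struct x86_emulate_ctxt *ctxt)
    {
        switch ( ctxt->opcode )
        {
        case 0x6c ... 0x6f: /* INS / OUTS */
        case 0xe4 ... 0xe7: /* IN / OUT imm8 */
        case 0xec ... 0xef: /* IN / OUT %dx */
        case 0xfa: case 0xfb: /* CLI / STI */
            /* (The privileged two-byte opcodes would need adding too.) */
            return X86EMUL_OKAY;
        }

        return X86EMUL_UNHANDLEABLE;
    }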

Another thing to consider is whether to extend the X86EMUL_EXCEPTION
handling change mentioned above to other string instructions. In that
case this should probably be broken out into a prereq patch.
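
For reviewers' orientation only (condensed from the traps.c hunks
below, with the guest_io_okay() and I/O breakpoint handling elided;
this adds nothing beyond the patch itself): the intended use of the new
X86EMUL_DONE return code from the I/O read hook is

    static int priv_op_read_io(unsigned int port, unsigned int bytes,
                               unsigned long *val,
                               struct x86_emulate_ctxt *ctxt)
    {
        /* ... permission and breakpoint checks elided ... */
        if ( admin_io_okay(port, bytes, currd) )
        {
            io_emul_stub_t *io_emul =
                io_emul_stub_setup(poc, ctxt->opcode, port, bytes);

            mark_regs_dirty(ctxt->regs);
            io_emul(ctxt->regs);  /* The stub has updated the GPRs itself, */
            return X86EMUL_DONE;  /* so x86_emulate() skips its own GPR
                                     writeback and only commits rIP/EFLAGS. */
        }

        *val = guest_io_read(port, bytes, currd);
        return X86EMUL_OKAY;
    }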

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -20,6 +20,9 @@ typedef bool bool_t;
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
+#define likely(x)   __builtin_expect(!!(x), true)
+#define unlikely(x) __builtin_expect(!!(x), false)
+
 #define __packed __attribute__((packed))
 
 /* For generic assembly code: use macros to define operation/operand sizes. */
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         hvm_inject_page_fault(pfec, addr);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 hvm_inject_page_fault(pfec, addr & PAGE_MASK);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -568,6 +569,7 @@ static int hvmemul_virtual_to_linear(
         return X86EMUL_UNHANDLEABLE;
 
     /* This is a singleton operation: fail it with an exception. */
+    *reps = 0;
     hvmemul_ctxt->exn_pending = 1;
     hvmemul_ctxt->trap.vector =
         (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -662,16 +662,13 @@ static void do_guest_trap(unsigned int t
                 trapstr(trapnr), trapnr, regs->error_code);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1272,7 +1269,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1305,7 +1302,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -1989,6 +1986,154 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              const struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( unlikely(limit < bytes - 1) ||
+             unlikely(offset > limit - bytes + 1) )
+        {
+            do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                          ctxt->regs);
+            return false;
+        }
+
+        *addr = (uint32_t)*addr;
+    }
+    else if ( unlikely(!__addr_ok(*addr)) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return false;
+    }
+
+    return true;
+}
+
+static int priv_op_insn_fetch(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             cpu_has_nx ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access the I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default:
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2242,6 +2387,234 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        do_guest_trap(TRAP_gp_fault, ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, PFEC_write_access);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, 0);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2382,6 +2755,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2409,6 +2783,22 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior we defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2699,493 +3089,170 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
 
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
-    {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
+    return X86EMUL_OKAY;
+}
 
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
+static int priv_op_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct cpu_user_regs regs = *ctxt->regs;
+
+    regs._eax = *eax;
+    regs._ebx = *ebx;
+    regs._ecx = *ecx;
+    regs._edx = *edx;
+
+    pv_cpuid(&regs);
+
+    *eax = regs._eax;
+    *ebx = regs._ebx;
+    *ecx = regs._ecx;
+    *edx = regs._edx;
 
-        port = (u16)regs->edx;
+    return X86EMUL_OKAY;
+}
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
-                                     PFEC_write_access);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+static int priv_op_hw_exception(uint8_t vector, int32_t error_code,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    do_guest_trap(vector, ctxt->regs);
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(esi)
-                                     + op_bytes - rc, 0);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
+    if ( error_code >= 0 )
+    {
+        struct trap_bounce *tb = &current->arch.pv_vcpu.trap_bounce;
 
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    return X86EMUL_EXCEPTION;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .write               = x86emul_unhandleable_rw,
+    .cmpxchg             = x86emul_unhandleable_cx,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = priv_op_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+    .inject_hw_exception = priv_op_hw_exception,
+};
 
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-        goto done;
-    }
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
     /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
+     * Don't have x86_emulate() inject single step traps, as we want #DB
+     * also delivered for I/O break points (see below).
      */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+    if ( regs->_eflags & X86_EFLAGS_TF )
+    {
+        ctxt.bpmatch = DR_STEP;
+        regs->_eflags &= ~X86_EFLAGS_TF;
+    }
+    eflags = regs->_eflags;
 
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
+    /* Un-mirror virtualized state from EFLAGS. */
+    if ( (regs->_eflags ^ eflags) & X86_EFLAGS_IF )
     {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
+        /* The only allowed insns altering EFLAGS.IF are CLI/STI. */
+        ASSERT((ctxt.ctxt.opcode & ~1) == 0xfa);
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(curr, evtchn_upcall_mask) = (opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
     }
-
-    /* No decode of this single-byte opcode. */
-    goto fail;
-
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
-            break;
-        case 0xd1: /* XSETBV */
-        {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
-
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+    regs->_eflags |= X86_EFLAGS_IF;
+    /* Nothing we allow to be emulated can change IOPL or TF. */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IOPL | X86_EFLAGS_TF)));
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+    if ( ctxt.bpmatch & DR_STEP )
+        regs->_eflags |= X86_EFLAGS_TF;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
             {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
-
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
-
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
-
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+                uint64_t val = rdtsc();
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
-
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
-
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
-
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        pv_cpuid(regs);
-        break;
-
-    default:
-        goto fail;
+        /* fall through */
+    case X86EMUL_RETRY:
+    case X86EMUL_EXCEPTION:
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3515,7 +3583,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,7 +857,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -1075,7 +1079,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc != X86EMUL_DONE ? rc : X86EMUL_OKAY;
 
     /* Ensure that the TSS is valid and has an io-bitmap-offset field. */
     if ( !tr.attr.fields.p ||
@@ -1599,6 +1603,17 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
@@ -2263,6 +2278,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
+    bool tf = ctxt->regs->eflags & EFLG_TF;
     struct operand src = { .reg = REG_POISON };
     struct operand dst = { .reg = REG_POISON };
     enum x86_swint_type swint_type;
@@ -2718,14 +2734,10 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( ((nr_reps == 1) && (ops->write != x86emul_unhandleable_rw)) ||
+             !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2737,6 +2749,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2747,14 +2761,10 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( ((nr_reps == 1) && (ops->read != x86emul_unhandleable_rw)) ||
+             !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2768,6 +2778,8 @@ x86_emulate(
             _regs.esi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3026,6 +3038,7 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
         else if ( rc != X86EMUL_OKAY )
             goto done;
@@ -3842,7 +3855,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5195,11 +5212,6 @@ x86_emulate(
     }
 
  no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
@@ -5207,7 +5219,18 @@ x86_emulate(
     if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip    = _regs.eip;
+        ctxt->regs->eflags = _regs.eflags;
+        rc = X86EMUL_OKAY;
+    }
+
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( tf && (rc == X86EMUL_OKAY) && ops->inject_hw_exception )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -111,6 +111,13 @@ struct __packed segment_register {
 #define X86EMUL_RETRY          3
  /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */
 #define X86EMUL_CMPXCHG_FAILED 3
+ /*
+  * Operation fully done by one of the hooks:
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -522,6 +529,15 @@ x86emul_unhandleable_rw(
     void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
+/* Unhandleable cmpxchg */
+int
+x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt);
 
 #ifdef __XEN__
 

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 17/17] x86emul: don't assume a memory operand
  2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
                   ` (16 preceding siblings ...)
  2016-09-08 13:20 ` [PATCH 16/17] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-09-08 13:21 ` Jan Beulich
  17 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:21 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 3848 bytes --]

Especially for x86_insn_operand_ea() to return dependable segment
information even when the caller didn't consider applicability, we
shouldn't have ea.type start out as OP_MEM. Make it OP_NONE instead,
and set it to OP_MEM only when we actually encounter memory-like operands.

This requires eliminating the XSA-123 fix, which has no longer been
necessary since the elimination of the union in commit dd766684e7. That
in turn allows restricting the scope of override_seg to x86_decode().
On this occasion, also give it a proper type instead of plain int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1632,7 +1632,6 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
-    int override_seg;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -1664,7 +1663,6 @@ struct x86_emulate_state {
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
-#define override_seg (state->override_seg)
 #define ea (state->ea)
 
 static int
@@ -1693,6 +1691,7 @@ x86_decode_base(
     case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
     case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         /* Source EA is not encoded via ModRM. */
+        ea.type = OP_MEM;
         ea.mem.off = insn_fetch_bytes(ad_bytes);
         break;
 
@@ -1783,11 +1782,11 @@ x86_decode(
 {
     uint8_t b, d, sib, sib_index, sib_base;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
+    enum x86_segment override_seg = x86_seg_none;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
-    override_seg = -1;
-    ea.type = OP_MEM;
+    ea.type = OP_NONE;
     ea.mem.seg = x86_seg_ds;
     ea.reg = REG_POISON;
     state->regs = ctxt->regs;
@@ -2085,6 +2084,7 @@ x86_decode(
         else if ( ad_bytes == 2 )
         {
             /* 16-bit ModR/M decode. */
+            ea.type = OP_MEM;
             switch ( modrm_rm )
             {
             case 0:
@@ -2135,6 +2135,7 @@ x86_decode(
         else
         {
             /* 32/64-bit ModR/M decode. */
+            ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
                 sib = insn_fetch_type(uint8_t);
@@ -2199,7 +2200,7 @@ x86_decode(
         }
     }
 
-    if ( override_seg != -1 && ea.type == OP_MEM )
+    if ( override_seg != x86_seg_none )
         ea.mem.seg = override_seg;
 
     /* Fetch the immediate operand, if present. */
@@ -4250,13 +4251,11 @@ x86_emulate(
             generate_exception_if(limit < sizeof(long) ||
                                   (limit & (limit - 1)), EXC_UD, -1);
             base &= ~(limit - 1);
-            if ( override_seg == -1 )
-                override_seg = x86_seg_ds;
             if ( ops->rep_stos )
             {
                 unsigned long nr_reps = limit / sizeof(zero);
 
-                rc = ops->rep_stos(&zero, override_seg, base, sizeof(zero),
+                rc = ops->rep_stos(&zero, ea.mem.seg, base, sizeof(zero),
                                    &nr_reps, ctxt);
                 if ( rc == X86EMUL_OKAY )
                 {
@@ -4268,7 +4267,7 @@ x86_emulate(
             }
             while ( limit )
             {
-                rc = ops->write(override_seg, base, &zero, sizeof(zero), ctxt);
+                rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 base += sizeof(zero);
@@ -5254,7 +5253,6 @@ x86_emulate(
 #undef rex_prefix
 #undef lock_prefix
 #undef vex
-#undef override_seg
 #undef ea
 
 #ifdef __XEN__




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: track only rIP in emulator state
  2016-09-08 13:08 ` [PATCH 04/17] x86emul: track only rIP in emulator state Jan Beulich
@ 2016-09-08 13:23   ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

>>> On 08.09.16 at 15:08, <JBeulich@suse.com> wrote:

Please disregard this one - it got sent out with the wrong number in the subject.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 07/17] x86emul: move x86_execute() common epilogue code
  2016-09-08 13:13 ` [PATCH 07/17] x86emul: move x86_execute() common epilogue code Jan Beulich
@ 2016-09-08 13:28   ` Jan Beulich
  2016-09-14 17:13   ` Andrew Cooper
  1 sibling, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-08 13:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

>>> On 08.09.16 at 15:13, <JBeulich@suse.com> wrote:
> Only code movement, no functional change.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Just noticed the title was left stale - should really be "x86emul:
move x86_emulate() common epilogue code".

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 01/17] x86emul: split instruction decoding from execution
  2016-09-08 13:04 ` [PATCH 01/17] x86emul: split instruction decoding from execution Jan Beulich
@ 2016-09-09 18:35   ` Andrew Cooper
  2016-09-12  7:20     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-09 18:35 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:04, Jan Beulich wrote:
> This is only the mechanical part, a subsequent patch will make non-
> mechanical adjustments to actually do all decoding in this new
> function.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -48,7 +48,9 @@
>  /* All operands are implicit in the opcode. */
>  #define ImplicitOps (DstImplicit|SrcImplicit)
>  
> -static uint8_t opcode_table[256] = {
> +typedef uint8_t opcode_desc_t;
> +
> +static const opcode_desc_t opcode_table[256] = {
>      /* 0x00 - 0x07 */
>      ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
>      ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
> @@ -178,7 +180,7 @@ static uint8_t opcode_table[256] = {
>      ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
>  };
>  
> -static uint8_t twobyte_table[256] = {
> +static const opcode_desc_t twobyte_table[256] = {
>      /* 0x00 - 0x07 */
>      SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
>      /* 0x08 - 0x0F */
> @@ -607,7 +609,7 @@ do{ asm volatile (
>  })
>  #define truncate_ea(ea) truncate_word((ea), ad_bytes)
>  
> -#define mode_64bit() (def_ad_bytes == 8)
> +#define mode_64bit() (ctxt->addr_size == 64)
>  
>  #define fail_if(p)                                      \
>  do {                                                    \
> @@ -1558,32 +1560,63 @@ int x86emul_unhandleable_rw(
>      return X86EMUL_UNHANDLEABLE;
>  }
>  
> -int
> -x86_emulate(
> -    struct x86_emulate_ctxt *ctxt,
> -    const struct x86_emulate_ops  *ops)
> -{
> -    /* Shadow copy of register state. Committed on successful emulation. */
> -    struct cpu_user_regs _regs = *ctxt->regs;
> +struct x86_emulate_state {
> +    unsigned int op_bytes, ad_bytes;
> +
> +    enum { ext_none, ext_0f, ext_0f38 } ext;
> +    uint8_t opcode;
> +    uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
> +    uint8_t rex_prefix;
> +    bool lock_prefix;
> +    opcode_desc_t desc;
> +    union vex vex;
> +    int override_seg;
>  
> -    uint8_t b, d, sib, sib_index, sib_base, rex_prefix = 0;
> -    uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
> -    enum { ext_none, ext_0f, ext_0f38 } ext = ext_none;
> -    union vex vex = {};
> -    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
> -    bool_t lock_prefix = 0;
> -    int override_seg = -1, rc = X86EMUL_OKAY;
> -    struct operand src = { .reg = REG_POISON };
> -    struct operand dst = { .reg = REG_POISON };
> -    enum x86_swint_type swint_type;
> -    struct x86_emulate_stub stub = {};
> -    DECLARE_ALIGNED(mmval_t, mmval);
>      /*
>       * Data operand effective address (usually computed from ModRM).
>       * Default is a memory operand relative to segment DS.
>       */
> -    struct operand ea = { .type = OP_MEM, .reg = REG_POISON };
> -    ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */
> +    struct operand ea;
> +
> +    /* Immediate operand values, if any. Use otherwise unused fields. */
> +#define imm1 ea.val
> +#define imm2 ea.orig_val

Some instructions (e.g. bextr) have both immediate and memory operands. 
Reusing ea like this is unsafe.

Immediate data was previously stashed in src, separately from ea.  In
the light of the XSA-123 problems, I think it would be better to just
have "uint64_t imm1, imm2;" here.
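
(Concretely, as a sketch of what I mean rather than a finished change:

    struct x86_emulate_state {
        ...
        uint64_t imm1, imm2;   /* dedicated immediate fields, nothing reused */
        ...
    };

with the "#define imm1 ea.val" / "#define imm2 ea.orig_val" aliases dropped.)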

> +
> +    /* Shadow copy of register state. Committed on successful emulation. */
> +    struct cpu_user_regs regs;
> +};
> +
> +/* Helper definitions. */
> +#define op_bytes (state->op_bytes)
> +#define ad_bytes (state->ad_bytes)
> +#define ext (state->ext)
> +#define modrm (state->modrm)
> +#define modrm_mod (state->modrm_mod)
> +#define modrm_reg (state->modrm_reg)
> +#define modrm_rm (state->modrm_rm)
> +#define rex_prefix (state->rex_prefix)
> +#define lock_prefix (state->lock_prefix)
> +#define vex (state->vex)
> +#define override_seg (state->override_seg)
> +#define ea (state->ea)
> +#define _regs (state->regs)
> +
> +static int
> +x86_decode(
> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt,
> +    const struct x86_emulate_ops  *ops)
> +{
> +    uint8_t b, d, sib, sib_index, sib_base;
> +    unsigned int def_op_bytes, def_ad_bytes;
> +    int rc = X86EMUL_OKAY;
> +
> +    memset(state, 0, sizeof(*state));
> +    override_seg = -1;
> +    ea.type = OP_MEM;
> +    ea.mem.seg = x86_seg_ds;
> +    ea.reg = REG_POISON;
> +    _regs = *ctxt->regs;

The helper definitions are fine for the transition period, but I would
like to see them eventually removed to help reduce the quantity of
information hiding in this area.  Please don't introduce new uses.

>  
>      ctxt->retire.byte = 0;
>  
> @@ -1800,7 +1833,7 @@ x86_emulate(
>                      d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
>                  break;
>              default: /* Until it is worth making this table based ... */
> -                goto cannot_emulate;
> +                return X86EMUL_UNHANDLEABLE;
>              }
>              break;
>  
> @@ -1932,6 +1965,61 @@ x86_emulate(
>      if ( override_seg != -1 && ea.type == OP_MEM )
>          ea.mem.seg = override_seg;
>  
> +    /* Fetch the immediate operand, if present. */
> +    switch ( d & SrcMask )
> +    {
> +        unsigned int bytes;
> +
> +    case SrcImm:
> +        if ( !(d & ByteOp) )
> +            bytes = op_bytes != 8 ? op_bytes : 4;
> +        else
> +        {
> +    case SrcImmByte:
> +            bytes = 1;
> +        }
> +        /* NB. Immediates are sign-extended as necessary. */
> +        switch ( bytes )
> +        {
> +        case 1: imm1 = insn_fetch_type(int8_t);  break;
> +        case 2: imm1 = insn_fetch_type(int16_t); break;
> +        case 4: imm1 = insn_fetch_type(int32_t); break;
> +        }
> +        break;
> +    case SrcImm16:
> +        imm1 = insn_fetch_type(uint16_t);
> +        break;
> +    }
> +
> +    state->opcode = b;
> +    state->desc = d;
> +
> + done:
> +    return rc;
> +}
> +
> +int
> +x86_emulate(
> +    struct x86_emulate_ctxt *ctxt,
> +    const struct x86_emulate_ops *ops)
> +{
> +    struct x86_emulate_state state;
> +    int rc;
> +    uint8_t b, d;
> +    struct operand src = { .reg = REG_POISON };
> +    struct operand dst = { .reg = REG_POISON };
> +    enum x86_swint_type swint_type;
> +    struct x86_emulate_stub stub = {};
> +    DECLARE_ALIGNED(mmval_t, mmval);
> +
> +    rc = x86_decode(&state, ctxt, ops);
> +    if ( rc != X86EMUL_OKAY)

Space before the bracket (although I guess these lines drop out before
the end of the series anyway?)

~Andrew

> +        return rc;
> +
> +    b = state.opcode;
> +    d = state.desc;
> +#define state (&state)
> +
>      /* Decode and fetch the source operand: register, memory or immediate. */
>      switch ( d & SrcMask )
>      {
> @@ -1987,18 +2075,12 @@ x86_emulate(
>              src.bytes = 1;
>          }
>          src.type  = OP_IMM;
> -        /* NB. Immediates are sign-extended as necessary. */
> -        switch ( src.bytes )
> -        {
> -        case 1: src.val = insn_fetch_type(int8_t);  break;
> -        case 2: src.val = insn_fetch_type(int16_t); break;
> -        case 4: src.val = insn_fetch_type(int32_t); break;
> -        }
> +        src.val   = imm1;
>          break;
>      case SrcImm16:
>          src.type  = OP_IMM;
>          src.bytes = 2;
> -        src.val   = insn_fetch_type(uint16_t);
> +        src.val   = imm1;
>          break;
>      }
>  
> @@ -3892,8 +3974,8 @@ x86_emulate(
>      /* Commit shadow register state. */
>      _regs.eflags &= ~EFLG_RF;
>  
> -    /* Zero the upper 32 bits of %rip if not in long mode. */
> -    if ( def_ad_bytes < sizeof(_regs.eip) )
> +    /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
> +    if ( !mode_64bit() )
>          _regs.eip = (uint32_t)_regs.eip;
>  
>      *ctxt->regs = _regs;
> @@ -4876,4 +4958,19 @@ x86_emulate(
>      _put_fpu();
>      put_stub(stub);
>      return X86EMUL_UNHANDLEABLE;
> +#undef state
>  }
> +
> +#undef op_bytes
> +#undef ad_bytes
> +#undef ext
> +#undef modrm
> +#undef modrm_mod
> +#undef modrm_reg
> +#undef modrm_rm
> +#undef rex_prefix
> +#undef lock_prefix
> +#undef vex
> +#undef override_seg
> +#undef ea
> +#undef _regs
>
>


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 01/17] x86emul: split instruction decoding from execution
  2016-09-09 18:35   ` Andrew Cooper
@ 2016-09-12  7:20     ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-12  7:20 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 09.09.16 at 20:35, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:04, Jan Beulich wrote:
>> @@ -607,7 +609,7 @@ do{ asm volatile (
>>  })
>>  #define truncate_ea(ea) truncate_word((ea), ad_bytes)
>>  
>> -#define mode_64bit() (def_ad_bytes == 8)
>> +#define mode_64bit() (ctxt->addr_size == 64)
>>  
>>  #define fail_if(p)                                      \
>>  do {                                                    \
>> @@ -1558,32 +1560,63 @@ int x86emul_unhandleable_rw(
>>      return X86EMUL_UNHANDLEABLE;
>>  }
>>  
>> -int
>> -x86_emulate(
>> -    struct x86_emulate_ctxt *ctxt,
>> -    const struct x86_emulate_ops  *ops)
>> -{
>> -    /* Shadow copy of register state. Committed on successful emulation. */
>> -    struct cpu_user_regs _regs = *ctxt->regs;
>> +struct x86_emulate_state {
>> +    unsigned int op_bytes, ad_bytes;
>> +
>> +    enum { ext_none, ext_0f, ext_0f38 } ext;
>> +    uint8_t opcode;
>> +    uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
>> +    uint8_t rex_prefix;
>> +    bool lock_prefix;
>> +    opcode_desc_t desc;
>> +    union vex vex;
>> +    int override_seg;
>>  
>> -    uint8_t b, d, sib, sib_index, sib_base, rex_prefix = 0;
>> -    uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
>> -    enum { ext_none, ext_0f, ext_0f38 } ext = ext_none;
>> -    union vex vex = {};
>> -    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
>> -    bool_t lock_prefix = 0;
>> -    int override_seg = -1, rc = X86EMUL_OKAY;
>> -    struct operand src = { .reg = REG_POISON };
>> -    struct operand dst = { .reg = REG_POISON };
>> -    enum x86_swint_type swint_type;
>> -    struct x86_emulate_stub stub = {};
>> -    DECLARE_ALIGNED(mmval_t, mmval);
>>      /*
>>       * Data operand effective address (usually computed from ModRM).
>>       * Default is a memory operand relative to segment DS.
>>       */
>> -    struct operand ea = { .type = OP_MEM, .reg = REG_POISON };
>> -    ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */
>> +    struct operand ea;
>> +
>> +    /* Immediate operand values, if any. Use otherwise unused fields. */
>> +#define imm1 ea.val
>> +#define imm2 ea.orig_val
> 
> Some instructions (e.g. bextr) have both immediate and memory operands. 
> Reusing ea like this is unsafe.

I disagree: neither field has ever been used for anything, and more than
once I had thought about how to eliminate them from ea without
eliminating them from src and dst, until I finally found a use for them
here.

> Immediate data was previously stashed in src, separately from ea.  In
> the light of the XSA-123 problems, I think it would be better to just
> have "uint64_t imm1, imm2;" here.

The XSA-123 problem was completely different, and I specifically took
this into consideration. We're not aliasing scalars and pointers here,
so there's no risk of causing a new security issue. Just to repeat -
these two fields have been completely unused so far.

>> +
>> +    /* Shadow copy of register state. Committed on successful emulation. */
>> +    struct cpu_user_regs regs;
>> +};
>> +
>> +/* Helper definitions. */
>> +#define op_bytes (state->op_bytes)
>> +#define ad_bytes (state->ad_bytes)
>> +#define ext (state->ext)
>> +#define modrm (state->modrm)
>> +#define modrm_mod (state->modrm_mod)
>> +#define modrm_reg (state->modrm_reg)
>> +#define modrm_rm (state->modrm_rm)
>> +#define rex_prefix (state->rex_prefix)
>> +#define lock_prefix (state->lock_prefix)
>> +#define vex (state->vex)
>> +#define override_seg (state->override_seg)
>> +#define ea (state->ea)
>> +#define _regs (state->regs)
>> +
>> +static int
>> +x86_decode(
>> +    struct x86_emulate_state *state,
>> +    struct x86_emulate_ctxt *ctxt,
>> +    const struct x86_emulate_ops  *ops)
>> +{
>> +    uint8_t b, d, sib, sib_index, sib_base;
>> +    unsigned int def_op_bytes, def_ad_bytes;
>> +    int rc = X86EMUL_OKAY;
>> +
>> +    memset(state, 0, sizeof(*state));
>> +    override_seg = -1;
>> +    ea.type = OP_MEM;
>> +    ea.mem.seg = x86_seg_ds;
>> +    ea.reg = REG_POISON;
>> +    _regs = *ctxt->regs;
> 
> The helper definitions are fine for the transition period, but I would
> like to see them eventually removed to help reduce the quantity of
> information hiding in this area.  Please don't introduce new uses.

I simply can't write e.g. "state->ea.type" here, as that would get
macro-expanded to "state->(state->ea).type". I've carefully tried to
avoid introducing new uses where possible, and I'll be happy to
correct cases where I failed to, but here we just can't (and I hope
you agree that it would be odd not to use the helpers consistently
in a single block of code); i.e. I'd like to not replace _regs (which
goes away in patch 3 anyway).
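
(To illustrate with a stripped-down, standalone example - hypothetical
names, not the emulator's code:

    struct operand { int type; };
    struct st { struct operand ea; };

    #define ea (state->ea)

    void f(struct st *state)
    {
        ea.type = 0;           /* expands to (state->ea).type = 0 - fine */
        /* state->ea.type = 0;    would expand to state->(state->ea).type,
         * which doesn't compile. */
    }

hence the helpers need to be used for every access within this block.)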

>> +int
>> +x86_emulate(
>> +    struct x86_emulate_ctxt *ctxt,
>> +    const struct x86_emulate_ops *ops)
>> +{
>> +    struct x86_emulate_state state;
>> +    int rc;
>> +    uint8_t b, d;
>> +    struct operand src = { .reg = REG_POISON };
>> +    struct operand dst = { .reg = REG_POISON };
>> +    enum x86_swint_type swint_type;
>> +    struct x86_emulate_stub stub = {};
>> +    DECLARE_ALIGNED(mmval_t, mmval);
>> +
>> +    rc = x86_decode(&state, ctxt, ops);
>> +    if ( rc != X86EMUL_OKAY)
> 
> Space before the bracket (although I guess these lines drop out before
> the end of the series anyway?)

Oops. And to the question - no, why would they? Emulation obviously
requires decoding as the first step.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase
  2016-09-08 13:07 ` [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase Jan Beulich
@ 2016-09-13 18:44   ` Andrew Cooper
  2016-09-14  9:55     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-13 18:44 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:07, Jan Beulich wrote:
> This way we can offer to callers the service of just sizing
> instructions, and we also can better guarantee not to raise the wrong
> fault due to not having read all relevant bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -129,8 +129,8 @@ static const opcode_desc_t opcode_table[
>      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>      ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, ImplicitOps,
>      /* 0xA0 - 0xA7 */
> -    ByteOp|DstEax|SrcImplicit|Mov, DstEax|SrcImplicit|Mov,
> -    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
> +    ByteOp|DstEax|SrcMem|Mov, DstEax|SrcMem|Mov,
> +    ByteOp|DstMem|SrcEax|Mov, DstMem|SrcEax|Mov,
>      ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
>      ByteOp|ImplicitOps, ImplicitOps,
>      /* 0xA8 - 0xAF */
> @@ -1602,6 +1602,45 @@ struct x86_emulate_state {
>  #define _regs (state->regs)
>  
>  static int
> +x86_decode_base(

What do you mean by decode_base here?

> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt,
> +    const struct x86_emulate_ops *ops)
> +{
> +    int rc = X86EMUL_OKAY;
> +
> +    switch ( state->opcode )
> +    {
> +    case 0x9a: /* call (far, absolute) */
> +    case 0xea: /* jmp (far, absolute) */
> +        generate_exception_if(mode_64bit(), EXC_UD, -1);
> +
> +        imm1 = insn_fetch_bytes(op_bytes);
> +        imm2 = insn_fetch_type(uint16_t);
> +        break;
> +
> +    case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
> +    case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
> +        /* Source EA is not encoded via ModRM. */
> +        ea.mem.off = insn_fetch_bytes(ad_bytes);
> +        break;
> +
> +    case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
> +        if ( op_bytes == 8 ) /* Fetch more bytes to obtain imm64. */
> +            imm1 = ((uint32_t)imm1 |
> +                    ((uint64_t)insn_fetch_type(uint32_t) << 32));
> +        break;
> +
> +    case 0xc8: /* enter imm16,imm8 */
> +        imm2 = insn_fetch_type(uint8_t);
> +        break;
> +    }
> +
> + done:
> +    return rc;
> +}
> +
> +static int
>  x86_decode(
>      struct x86_emulate_state *state,
>      struct x86_emulate_ctxt *ctxt,
> @@ -1994,10 +2033,29 @@ x86_decode(
>      state->opcode = b;
>      state->desc = d;
>  
> +    switch ( ext )
> +    {
> +    case ext_none:
> +        rc = x86_decode_base(state, ctxt, ops);
> +        break;
> +
> +    case ext_0f:
> +    case ext_0f38:
> +        break;
> +
> +    default:
> +        ASSERT_UNREACHABLE();
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
>   done:
>      return rc;
>  }
>  
> +/* No insn fetching past this point. */
> +#undef insn_fetch_bytes
> +#undef insn_fetch_type
> +
>  int
>  x86_emulate(
>      struct x86_emulate_ctxt *ctxt,
> @@ -2560,6 +2618,8 @@ x86_emulate(
>      case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
>          generate_exception_if((modrm_reg & 7) != 0, EXC_UD, -1);
>      case 0x88 ... 0x8b: /* mov */
> +    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
> +    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
>          dst.val = src.val;
>          break;
>  
> @@ -2644,18 +2704,13 @@ x86_emulate(
>  
>      case 0x9a: /* call (far, absolute) */ {
>          struct segment_register reg;
> -        uint16_t sel;
> -        uint32_t eip;
>  
> -        generate_exception_if(mode_64bit(), EXC_UD, -1);
> +        ASSERT(!mode_64bit());

Are we going to strictly require that no one ever hand-crafts an
x86_emulate_state and hands it to x86_emulate()?

I would suggest leaving the generate_exception_if(mode_64bit(), EXC_UD,
-1); after the ASSERT() so even if we do end up in a wonky state, we
don't try to jump the guest to 0.

Similarly for jmp.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 03/17] x86emul: track only rIP in emulator state
  2016-09-08 13:09 ` [PATCH 03/17] " Jan Beulich
@ 2016-09-13 19:09   ` Andrew Cooper
  2016-09-14  9:58     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-13 19:09 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:09, Jan Beulich wrote:
> Now that all decoding happens in x86_decode() there's no need to keep
> the local registers copy in struct x86_emulate_state. Only rIP gets
> updated in the decode phase, so only that register needs tracking
> there. All other (read-only) registers can be read from the original
> structure (but sadly, due to it getting passed to decode_register(),
> the pointer can't be made point to "const" to make the compiler help
> ensure no modification happens).

I was going to suggest making a second helper and casting away
constness, but that also has problems with the mark_regs_dirty() call.

However, on further consideration...

> @@ -2061,6 +2064,8 @@ x86_emulate(
>      struct x86_emulate_ctxt *ctxt,
>      const struct x86_emulate_ops *ops)
>  {
> +    /* Shadow copy of register state. Committed on successful emulation. */
> +    struct cpu_user_regs _regs = *ctxt->regs;
>      struct x86_emulate_state state;
>      int rc;
>      uint8_t b, d;
> @@ -2074,10 +2079,21 @@ x86_emulate(
>      if ( rc != X86EMUL_OKAY)
>          return rc;
>  
> +    /* Sync rIP to post decode value. */
> +    _regs.eip = state.eip;
> +
>      b = state.opcode;
>      d = state.desc;
>  #define state (&state)
>  
> +    /* Re-vector ea's register pointer into our shadow registers. */
> +    if ( ea.type == OP_REG )
> +    {
> +        unsigned int offs = (void *)ea.reg - (void *)state->regs;
> +
> +        ea.reg = (void *)&_regs + offs;
> +    }
> +

This is some very hairy pointer arithmetic.

Why do we need to decode registers in x86_decode()?

We don't need to decode the GPRs to calculate the length of the
instruction.  If the displacement is stashed in x86_emulate_state, the
calculation of ea can be deferred until the start of x86_emulate(), and
no arithmetic like this would be necessary.
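
(Roughly, as a sketch of the register-operand side of that idea - placement
only, ignoring the byte-register/REX handling:

    /* x86_decode(): record the encoding; nothing points into a regs copy. */
    ea.type = OP_REG;

    /* x86_emulate(), once "_regs = *ctxt->regs;" has been made: */
    if ( ea.type == OP_REG )
        ea.reg = decode_register(modrm_rm, &_regs, 0);

so the state never needs re-vectoring against a particular cpu_user_regs.)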

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase
  2016-09-13 18:44   ` Andrew Cooper
@ 2016-09-14  9:55     ` Jan Beulich
  2016-09-23 14:48       ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-14  9:55 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 13.09.16 at 20:44, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:07, Jan Beulich wrote:
>> @@ -1602,6 +1602,45 @@ struct x86_emulate_state {
>>  #define _regs (state->regs)
>>  
>>  static int
>> +x86_decode_base(
> 
> What do you mean by decode_base here?

The base instruction set (no 0f or similar escape prefixes). Suggestions
for a better name are welcome.

>> @@ -2644,18 +2704,13 @@ x86_emulate(
>>  
>>      case 0x9a: /* call (far, absolute) */ {
>>          struct segment_register reg;
>> -        uint16_t sel;
>> -        uint32_t eip;
>>  
>> -        generate_exception_if(mode_64bit(), EXC_UD, -1);
>> +        ASSERT(!mode_64bit());
> 
> Are we going to strictly require that no one ever hand-crafts an
> x86_emulate_state and hands it to x86_emulate()?

Absolutely - that's why its definition does not live in a header.

> I would suggest leaving the generate_exception_if(mode_64bit(), EXC_UD,
> -1); after the ASSERT() so even if we do end up in a wonky state, we
> don't try to jump the guest to 0.

That would look really strange to a reader, I think, and hence I'd
rather not do this if I can get the patch accepted without.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 03/17] x86emul: track only rIP in emulator state
  2016-09-13 19:09   ` Andrew Cooper
@ 2016-09-14  9:58     ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-14  9:58 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 13.09.16 at 21:09, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:09, Jan Beulich wrote:
>> Now that all decoding happens in x86_decode() there's no need to keep
>> the local registers copy in struct x86_emulate_state. Only rIP gets
>> updated in the decode phase, so only that register needs tracking
>> there. All other (read-only) registers can be read from the original
>> structure (but sadly, due to it getting passed to decode_register(),
>> the pointer can't be made point to "const" to make the compiler help
>> ensure no modification happens).
> 
> I was going to suggest making a second helper and casting away
> constness, but that also has problems with the mark_regs_dirty() call.
> 
> However, on further consideration...
> 
>> @@ -2061,6 +2064,8 @@ x86_emulate(
>>      struct x86_emulate_ctxt *ctxt,
>>      const struct x86_emulate_ops *ops)
>>  {
>> +    /* Shadow copy of register state. Committed on successful emulation. */
>> +    struct cpu_user_regs _regs = *ctxt->regs;
>>      struct x86_emulate_state state;
>>      int rc;
>>      uint8_t b, d;
>> @@ -2074,10 +2079,21 @@ x86_emulate(
>>      if ( rc != X86EMUL_OKAY)
>>          return rc;
>>  
>> +    /* Sync rIP to post decode value. */
>> +    _regs.eip = state.eip;
>> +
>>      b = state.opcode;
>>      d = state.desc;
>>  #define state (&state)
>>  
>> +    /* Re-vector ea's register pointer into our shadow registers. */
>> +    if ( ea.type == OP_REG )
>> +    {
>> +        unsigned int offs = (void *)ea.reg - (void *)state->regs;
>> +
>> +        ea.reg = (void *)&_regs + offs;
>> +    }
>> +
> 
> This is some very hairy pointer arithmetic.
> 
> Why do we need to decode registers in x86_decode()?
> 
> We don't need to decode the GPRs to calculate the length of the
> instruction.  If the displacement is stashed in x86_emulate_state, the
> calculation of ea can be deferred until the start of x86_emulate(), and
> no arithmetic like this would be necessary.

That's a good idea; I didn't really like this pointer arithmetic myself,
but hadn't thought of this (seemingly obvious) alternative.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-08 13:10 ` [PATCH 04/17] x86emul: complete decoding of two-byte instructions Jan Beulich
@ 2016-09-14 14:22   ` Andrew Cooper
  2016-09-14 15:05     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 14:22 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:10, Jan Beulich wrote:
> This way we can at least size (and e.g. skip) them if needed, and we
> also won't raise the wrong fault due to not having read all relevant
> bytes.

What faults are you referring to?  #UD vs #GP from hitting the %cs limit?

>
> This at once adds correct raising of #UD for the three "ud<n>" flavors
> (Intel names only "ud2", but AMD names all three of them in their
> opcode maps), as that may make a difference to callers compared to
> getting back X86EMUL_UNHANDLEABLE.

Definitely a good improvement.  I have been meaning to do this for a while.

Intel does reference 0FB9 in a footnote in the opcode map, but I can't
see any mention of 0FFF at all.

>
> Note on opcodes 0FA6 and 0FA7: These are VIA's PadLock instructions,
> which have a ModRM like byte where only register forms are valid. I.e.
> we could also use SrcImmByte there, but ModRM is more likely to be
> correct for a hypothetical extension allowing non-register operations.

Won't the use of ModRM possibly cause us to read too much if it ends up
with SIB and displacement encoding?  OTOH, do we really care?

>
> Note on opcode 0FB8: I think we're safe to ignore JMPE (which doesn't
> take a ModRM byte, but an immediate).

It took a while to find out what this instruction is.  Mind indicating
that it is Itanium-specific in the commit message?

POPCNT, the aliased instruction, takes a full ModRM byte with no space to
distinguish the two.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-14 14:22   ` Andrew Cooper
@ 2016-09-14 15:05     ` Jan Beulich
  2016-09-23 16:34       ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-14 15:05 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 14.09.16 at 16:22, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:10, Jan Beulich wrote:
>> This way we can at least size (and e.g. skip) them if needed, and we
>> also won't raise the wrong fault due to not having read all relevant
>> bytes.
> 
> What faults are you referring to?  #UD vs #GP from hitting the %cs limit?

Or #PF.

>> This at once adds correct raising of #UD for the three "ud<n>" flavors
>> (Intel names only "ud2", but AMD names all three of them in their
>> opcode maps), as that may make a difference to callers compared to
>> getting back X86EMUL_UNHANDLEABLE.
> 
> Definitely a good improvement.  I have been meaning to do this for a while.
> 
> Intel does reference 0FB9 in a footnote in the opcode map, but I can't
> see any mention of 0FFF at all.

Check AMD's.

>> Note on opcodes 0FA6 and 0FA7: These are VIA's PadLock instructions,
>> which have a ModRM like byte where only register forms are valid. I.e.
>> we could also use SrcImmByte there, but ModRM is more likely to be
>> correct for a hypothetical extension allowing non-register operations.
> 
> Won't the use of ModRM possibly cause us to read too much if it ends up
> with SIB and displacement encoding?  OTOH, do we really care?

That's why I've added that paragraph: I'd be fine either way, but I
do think the intention is a ModRM byte, which is then also in line with
these opcodes' uses in early 386 and 486 processors (xbts/ibts/
cmpxchg).

>> Note on opcode 0FB8: I think we're safe to ignore JMPE (which doesn't
>> take a ModRM byte, but an immediate).
> 
> It took a while to find out what this instruction is.  Mind indicating
> that it is Itanium-specific in the commit message?

Sure.

> POPCNT, the aliased instruction, takes a full ModRM byte with no space to
> distinguish the two.

Well, distinguishing them is possible in principle, as by the time we
process bytes past the main opcode byte we already know whether
an F3 prefix was present. I simply think it's not worth trying to do
so.
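
(Purely illustrative - by that point repe_prefix() could already be used
to tell the two apart, e.g.

    if ( repe_prefix() )     /* F3 0F B8: POPCNT - has a ModRM byte */
        d = DstReg|SrcMem|ModRM;
    else                     /* 0F B8: JMPE, IA-64 only - immediate operand */
        d = SrcImm;

but I don't see the extra case as being worth carrying.)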

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 05/17] x86emul: add XOP decoding
  2016-09-08 13:11 ` [PATCH 05/17] x86emul: add XOP decoding Jan Beulich
@ 2016-09-14 16:11   ` Andrew Cooper
  2016-09-14 16:21     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 16:11 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:11, Jan Beulich wrote:
> This way we can at least size (and e.g. skip) them if needed, and we
> also won't raise the wrong fault due to not having read all relevant
> bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -279,6 +279,12 @@ static const opcode_desc_t twobyte_table
>      ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
>  };
>  
> +static const opcode_desc_t xop_table[] = {
> +    DstReg|SrcImmByte|ModRM,
> +    DstReg|SrcMem|ModRM,
> +    DstReg|SrcImm|ModRM,
> +};
> +
>  #define REX_PREFIX 0x40
>  #define REX_B 0x01
>  #define REX_X 0x02
> @@ -1580,6 +1586,9 @@ struct x86_emulate_state {
>          ext_0f   = vex_0f,
>          ext_0f38 = vex_0f38,
>          ext_0f3a = vex_0f3a,
> +        ext_8f08 = 8,
> +        ext_8f09,
> +        ext_8f0a,

What is this = 8 for?  I presume you didn't slip it in accidentally, but
I still can't figure out why.

>      } ext;
>      uint8_t opcode;
>      uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
> @@ -1802,7 +1811,7 @@ x86_decode(
>          modrm = insn_fetch_type(uint8_t);
>          modrm_mod = (modrm & 0xc0) >> 6;
>  
> -        if ( !ext && ((b & ~1) == 0xc4) )
> +        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
>              switch ( def_ad_bytes )
>              {
>              default:
> @@ -1816,11 +1825,11 @@ x86_decode(
>                      break;
>                  /* fall through */
>              case 8:
> -                /* VEX */
> +                /* VEX / XOP */
>                  generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
>  
>                  vex.raw[0] = modrm;
> -                if ( b & 1 )
> +                if ( b == 0xc5 )
>                  {
>                      vex.raw[1] = modrm;
>                      vex.opcx = vex_0f;
> @@ -1848,18 +1857,30 @@ x86_decode(
>                      rex_prefix |= REX_R;
>  
>                  b = insn_fetch_type(uint8_t);
> -                switch ( ext = vex.opcx )
> +                ext = vex.opcx;
> +                if ( b != 0x8f )
> +                {
> +                    switch ( ext )
> +                    {
> +                    case vex_0f:
> +                        d = twobyte_table[b];
> +                        break;
> +                    case vex_0f38:
> +                        d = twobyte_table[0x38];
> +                        break;
> +                    case vex_0f3a:
> +                        d = twobyte_table[0x3a];
> +                        break;
> +                    default:
> +                        rc = X86EMUL_UNHANDLEABLE;
> +                        goto done;
> +                    }
> +                }
> +                else if ( ext < ext_8f08 +
> +                                sizeof(xop_table) / sizeof(*xop_table) )

ARRAY_SIZE() ?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 05/17] x86emul: add XOP decoding
  2016-09-14 16:11   ` Andrew Cooper
@ 2016-09-14 16:21     ` Jan Beulich
  2016-09-23 17:01       ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-14 16:21 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 14.09.16 at 18:11, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:11, Jan Beulich wrote:
>> @@ -1580,6 +1586,9 @@ struct x86_emulate_state {
>>          ext_0f   = vex_0f,
>>          ext_0f38 = vex_0f38,
>>          ext_0f3a = vex_0f3a,
>> +        ext_8f08 = 8,
>> +        ext_8f09,
>> +        ext_8f0a,
> 
> What is this = 8 for?  I presume you didn't slip it in accidentally, but
> I still can't figure out why.

So I can use the value directly from vex.opcx, without further
adjustment.
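
(Spelled out - a sketch of the effect rather than the literal patch code:

    ext = vex.opcx;                  /* XOP map_select: 8, 9, or 0xA */
    ...
    d = xop_table[ext - ext_8f08];   /* indexes 0, 1, or 2 */

so with ext_8f08 == 8 neither the assignment nor the table lookup needs any
adjustment of the value.)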

>> @@ -1848,18 +1857,30 @@ x86_decode(
>>                      rex_prefix |= REX_R;
>>  
>>                  b = insn_fetch_type(uint8_t);
>> -                switch ( ext = vex.opcx )
>> +                ext = vex.opcx;
>> +                if ( b != 0x8f )
>> +                {
>> +                    switch ( ext )
>> +                    {
>> +                    case vex_0f:
>> +                        d = twobyte_table[b];
>> +                        break;
>> +                    case vex_0f38:
>> +                        d = twobyte_table[0x38];
>> +                        break;
>> +                    case vex_0f3a:
>> +                        d = twobyte_table[0x3a];
>> +                        break;
>> +                    default:
>> +                        rc = X86EMUL_UNHANDLEABLE;
>> +                        goto done;
>> +                    }
>> +                }
>> +                else if ( ext < ext_8f08 +
>> +                                sizeof(xop_table) / sizeof(*xop_table) )
> 
> ARRAY_SIZE() ?

If we want to add another helper #define to the test code, yes. It
being a single instance, that addition didn't seem worth it to me.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 06/17] x86emul: add EVEX decoding
  2016-09-08 13:12 ` [PATCH 06/17] x86emul: add EVEX decoding Jan Beulich
@ 2016-09-14 17:05   ` Andrew Cooper
  2016-09-15  6:26     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 17:05 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:12, Jan Beulich wrote:
> This way we can at least size (and e.g. skip) them if needed, and we
> also won't raise the wrong fault due to not having read all relevant
> bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TBD: I'm kind of undecided whether to right away propagate evex.R into
>      modrm_reg (and then also deal with the new meaning of evex.x for
>      modrm_rm). Since that doesn't affect GPRs (and the extra bits
>      would need masking off when accessing GPRs) I've left this out for
>      now.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -336,6 +336,27 @@ union vex {
>          ptr[1] = rex | REX_PREFIX; \
>  } while (0)
>  
> +union evex {
> +    uint8_t raw[3];
> +    struct {
> +        uint8_t opcx:2;
> +        uint8_t :2;

Is this legal syntax?  I am guessing it compiles for you, so is it
perhaps a GCCism?

> +        uint8_t R:1;
> +        uint8_t b:1;
> +        uint8_t x:1;
> +        uint8_t r:1;
> +        uint8_t pfx:2;
> +        uint8_t evex:1;
> +        uint8_t reg:4;
> +        uint8_t w:1;
> +        uint8_t opmsk:3;
> +        uint8_t RX:1;
> +        uint8_t bcst:1;
> +        uint8_t lr:2;
> +        uint8_t z:1;
> +    };
> +};
> +
>  #define rep_prefix()   (vex.pfx >= vex_f3)
>  #define repe_prefix()  (vex.pfx == vex_f3)
>  #define repne_prefix() (vex.pfx == vex_f2)
> @@ -1596,6 +1617,7 @@ struct x86_emulate_state {
>      bool lock_prefix;
>      opcode_desc_t desc;
>      union vex vex;
> +    union evex evex;
>      int override_seg;
>  
>      /*
> @@ -1623,6 +1645,7 @@ struct x86_emulate_state {
>  #define rex_prefix (state->rex_prefix)
>  #define lock_prefix (state->lock_prefix)
>  #define vex (state->vex)
> +#define evex (state->evex)
>  #define override_seg (state->override_seg)
>  #define ea (state->ea)
>  
> @@ -1811,7 +1834,8 @@ x86_decode(
>          modrm = insn_fetch_type(uint8_t);
>          modrm_mod = (modrm & 0xc0) >> 6;
>  
> -        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
> +        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18)) ||
> +                      b == 0x62) )
>              switch ( def_ad_bytes )
>              {
>              default:
> @@ -1825,7 +1849,7 @@ x86_decode(
>                      break;
>                  /* fall through */
>              case 8:
> -                /* VEX / XOP */
> +                /* VEX / XOP / EVEX */
>                  generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
>  
>                  vex.raw[0] = modrm;
> @@ -1852,6 +1876,14 @@ x86_decode(
>                              op_bytes = 8;
>                          }
>                      }
> +                    if ( b == 0x62 )
> +                    {
> +                        evex.raw[0] = vex.raw[0];
> +                        evex.raw[1] = vex.raw[1];
> +                        evex.raw[2] = insn_fetch_type(uint8_t);
> +
> +                        vex.opcx = evex.opcx;

What is the meaning of opcx? The manuals list these as the mm fields.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 07/17] x86emul: move x86_execute() common epilogue code
  2016-09-08 13:13 ` [PATCH 07/17] x86emul: move x86_execute() common epilogue code Jan Beulich
  2016-09-08 13:28   ` Jan Beulich
@ 2016-09-14 17:13   ` Andrew Cooper
  1 sibling, 0 replies; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 17:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:13, Jan Beulich wrote:
> Only code movement, no functional change.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 08/17] x86emul: generate and make use of canonical opcode representation
  2016-09-08 13:14 ` [PATCH 08/17] x86emul: generate and make use of canonical opcode representation Jan Beulich
@ 2016-09-14 17:30   ` Andrew Cooper
  2016-09-15  6:43     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 17:30 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 08/09/16 14:14, Jan Beulich wrote:

"of a canonical opcode representation".

You appear to be inventing your own here, but it isn't the only
canonical form you could represent x86 opcodes with.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -415,12 +415,15 @@ struct x86_emulate_ctxt
>      /* Stack pointer width in bits (16, 32 or 64). */
>      unsigned int sp_size;
>  
> -    /* Set this if writes may have side effects. */
> -    uint8_t force_writeback;
> +    /* Canonical opcode (see below). */
> +    unsigned int opcode;
>  
>      /* Software event injection support. */
>      enum x86_swint_emulation swint_emulate;
>  
> +    /* Set this if writes may have side effects. */
> +    uint8_t force_writeback;
> +
>      /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */
>      union {
>          struct {
> @@ -435,6 +438,51 @@ struct x86_emulate_ctxt
>      void *data;
>  };
>  
> +/*
> + * This encodes the opcode extension in a "natural" way:

I am not sure what you mean by natural way here.  All you seem to mean
is that you are encoding instructions with the following method

> + *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
> + *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
> + *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
> + *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
> + *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
> + *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
> + * Hence no separate #define-s get added.

Please also describe what the xxxx fields mean.  Looking below, I guess
that the bottom byte is the opcode itself, and some bits of the 2nd byte
are legacy prefixes?

> + */
> +#define X86EMUL_OPC_EXT_MASK         0xffff0000
> +#define X86EMUL_OPC(ext, byte)       ((byte) | \
> +                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))

I would highly suggest using ((byte) & 0xff).  In the case that a change
is slightly out of range, this should cause a compiler error (duplicate
case statement) rather than a very subtle bug.

> +/*
> + * This includes the 0x66, 0xF3, and 0xF2 prefixes when used to alter
> + * functionality instead of just insn attributes, as well as VEX/EVEX:
> + */
> +#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
> +                                     X86EMUL_OPC_KIND_MASK)

The definition should presumably live after introducing the PFX_MASK and
KIND_MASK ?

> +
> +#define X86EMUL_OPC_PFX_MASK         0x00000300
> +# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
> +# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
> +# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)

The PFX mask is moderately obvious from here, but a sentence describing
what is legitimate to add in the future wouldn't go amiss.

> +
> +#define X86EMUL_OPC_KIND_MASK        0x00003000
> +#define X86EMUL_OPC_VEX_             0x00001000

OTOH, I am rather more confused about what is eligible for inclusion
into "kind".  Also, what does a kind of 0 indicate?

> +# define X86EMUL_OPC_VEX(ext, byte) \
> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
> +# define X86EMUL_OPC_VEX_66(ext, byte) \
> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
> +# define X86EMUL_OPC_VEX_F3(ext, byte) \
> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
> +# define X86EMUL_OPC_VEX_F2(ext, byte) \
> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
> +#define X86EMUL_OPC_EVEX_            0x00002000
> +# define X86EMUL_OPC_EVEX(ext, byte) \
> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
> +# define X86EMUL_OPC_EVEX_66(ext, byte) \
> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
> +# define X86EMUL_OPC_EVEX_F3(ext, byte) \
> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
> +# define X86EMUL_OPC_EVEX_F2(ext, byte) \
> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)

Why do we go to the effort of spelling out the individual VEX/EVEX
possibilities, but not the XOP ones?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-08 13:14 ` [PATCH 09/17] SVM: use generic instruction decoding Jan Beulich
@ 2016-09-14 17:56   ` Andrew Cooper
  2016-09-15  6:55     ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-14 17:56 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Boris Ostrovsky, Suravee Suthikulpanit

On 08/09/16 14:14, Jan Beulich wrote:
> @@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
>      return vmcb->nextrip - vmcb->rip;
>  }
>  
> -/* First byte: Length. Following bytes: Opcode bytes. */
> -#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
> -MAKE_INSTR(INVD,   2, 0x0f, 0x08);
> -MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
> -MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
> -MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
> -MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
> -MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
> -MAKE_INSTR(HLT,    1, 0xf4);
> -MAKE_INSTR(INT3,   1, 0xcc);
> -MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
> -MAKE_INSTR(PAUSE,  1, 0x90);
> -MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
> -MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
> -MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
> -MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
> -MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
> -MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
> -MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
> -
> -static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
> -{
> -    [INSTR_INVD]   = OPCODE_INVD,
> -    [INSTR_WBINVD] = OPCODE_WBINVD,
> -    [INSTR_CPUID]  = OPCODE_CPUID,
> -    [INSTR_RDMSR]  = OPCODE_RDMSR,
> -    [INSTR_WRMSR]  = OPCODE_WRMSR,
> -    [INSTR_VMCALL] = OPCODE_VMCALL,
> -    [INSTR_HLT]    = OPCODE_HLT,
> -    [INSTR_INT3]   = OPCODE_INT3,
> -    [INSTR_RDTSC]  = OPCODE_RDTSC,
> -    [INSTR_PAUSE]  = OPCODE_PAUSE,
> -    [INSTR_XSETBV] = OPCODE_XSETBV,
> -    [INSTR_VMRUN]  = OPCODE_VMRUN,
> -    [INSTR_VMLOAD] = OPCODE_VMLOAD,
> -    [INSTR_VMSAVE] = OPCODE_VMSAVE,
> -    [INSTR_STGI]   = OPCODE_STGI,
> -    [INSTR_CLGI]   = OPCODE_CLGI,
> -    [INSTR_INVLPGA] = OPCODE_INVLPGA,
> +static const struct {
> +    unsigned int opcode;
> +    struct {
> +        unsigned int rm:3;
> +        unsigned int reg:3;
> +        unsigned int mod:2;
> +#define MODRM(mod, reg, rm) { rm, reg, mod }
> +    } modrm;
> +} const opc_tab[INSTR_MAX_COUNT] = {
> +    [INSTR_PAUSE]  = { X86EMUL_OPC_F3(0, 0x90) },
> +    [INSTR_INT3]   = { X86EMUL_OPC(   0, 0xcc) },
> +    [INSTR_HLT]    = { X86EMUL_OPC(   0, 0xf4) },
> +    [INSTR_XSETBV] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
> +    [INSTR_VMRUN]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
> +    [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
> +    [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
> +    [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
> +    [INSTR_STGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
> +    [INSTR_CLGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
> +    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
> +    [INSTR_INVD]   = { X86EMUL_OPC(0x0f, 0x08) },
> +    [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) },
> +    [INSTR_WRMSR]  = { X86EMUL_OPC(0x0f, 0x30) },
> +    [INSTR_RDTSC]  = { X86EMUL_OPC(0x0f, 0x31) },
> +    [INSTR_RDMSR]  = { X86EMUL_OPC(0x0f, 0x32) },
> +    [INSTR_CPUID]  = { X86EMUL_OPC(0x0f, 0xa2) },
>  };
>  
> -static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
> -                    unsigned long addr, unsigned int len)
> -{
> -    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
> -
> -    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
> -    {
> -    case HVMCOPY_okay:
> -        break;
> -    case HVMCOPY_bad_gva_to_gfn:
> -        /* OK just to give up; we'll have injected #PF already */
> -        return 0;
> -    default:
> -        /* Not OK: fetches from non-RAM pages are not supportable. */
> -        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
> -                 vmcb->rip, addr);
> -        hvm_inject_hw_exception(TRAP_gp_fault, 0);
> -        return 0;
> -    }
> -    return 1;
> -}
> -
>  int __get_instruction_length_from_list(struct vcpu *v,
>          const enum instruction_index *list, unsigned int list_count)
>  {
>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> -    unsigned int i, j, inst_len = 0;
> -    enum instruction_index instr = 0;
> -    u8 buf[MAX_INST_LEN];
> -    const u8 *opcode = NULL;
> -    unsigned long fetch_addr, fetch_limit;
> -    unsigned int fetch_len, max_len;
> +    struct hvm_emulate_ctxt ctxt;
> +    struct x86_emulate_state *state;
> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
> +    int modrm_mod;
>  
> +#ifdef NDEBUG

Presumably this is just for your testing?

>      if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
>          return inst_len;
>  
>      if ( vmcb->exitcode == VMEXIT_IOIO )
>          return vmcb->exitinfo2 - vmcb->rip;
> +#endif
>  
> -    /* Fetch up to the next page break; we'll fetch from the next page
> -     * later if we have to. */
> -    fetch_addr = svm_rip2pointer(v, &fetch_limit);
> -    if ( vmcb->rip > fetch_limit )
> -        return 0;
> -    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
> -    fetch_len = min_t(unsigned int, max_len,
> -                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
> -    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
> +    ASSERT(v == current);
> +    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
> +    hvm_emulate_init(&ctxt, NULL, 0);
> +    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
> +    if ( IS_ERR_OR_NULL(state) )
>          return 0;
>  
> -    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
> -    {
> -        inst_len++;
> -        if ( inst_len >= fetch_len )
> -        {
> -            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
> -                        max_len - fetch_len) )
> -                return 0;
> -            fetch_len = max_len;
> -        }
> +    inst_len = x86_insn_length(state, &ctxt.ctxt);
> +    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
> +    x86_emulate_free_state(state);

From an API point of view, it is weird to have x86_emulate_free_state()
without a matching allocation function.  Perhaps that is just me.

However, the x86_insn_modrm() API is definitely more weird.  Wouldn't it
be more natural to take optional pointers for the mod, rm and reg parts
individually?


> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -382,7 +382,7 @@ struct operand {
>      } mem;
>  };
>  #ifdef __x86_64__
> -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
> +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
>  #else
>  #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>  #endif

Given that these are now used for general pointer poisoning, they should
be renamed.  There are only 3 instances.

> @@ -1631,6 +1631,10 @@ struct x86_emulate_state {
>  
>      unsigned long eip;
>      struct cpu_user_regs *regs;
> +
> +#ifndef NDEBUG
> +    void *caller;
> +#endif

Perhaps worth a comment here?  Its purpose is rather opaque.

>  };
>  
>  /* Helper definitions. */
> @@ -1658,6 +1662,11 @@ x86_decode_base(
>  
>      switch ( ctxt->opcode )
>      {
> +    case 0x90: /* nop / pause */
> +        if ( repe_prefix() )
> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
> +        break;

Why is it necessary to special case the rep prefix handling in this case?

> +int
> +x86_insn_modrm(const struct x86_emulate_state *state,
> +               unsigned int *rm, unsigned int *reg)
> +{
> +    check_state(state);
> +
> +    if ( !(state->desc & ModRM) )
> +        return -EINVAL;
> +
> +    if ( rm )
> +        *rm = state->modrm_rm;
> +    if ( reg )
> +        *reg = state->modrm_reg;
> +
> +    return state->modrm_mod;
> +}
> +
> +unsigned int
> +x86_insn_length(const struct x86_emulate_state *state,
> +                const struct x86_emulate_ctxt *ctxt)
> +{
> +    check_state(state);
> +
> +    return state->eip - ctxt->regs->eip;

Is it worth stashing a starting eip?  This calculation will go wrong
after the emulated state has been committed.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 06/17] x86emul: add EVEX decoding
  2016-09-14 17:05   ` Andrew Cooper
@ 2016-09-15  6:26     ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-15  6:26 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 14.09.16 at 19:05, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:12, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -336,6 +336,27 @@ union vex {
>>          ptr[1] = rex | REX_PREFIX; \
>>  } while (0)
>>  
>> +union evex {
>> +    uint8_t raw[3];
>> +    struct {
>> +        uint8_t opcx:2;
>> +        uint8_t :2;
> 
> Is this legal syntax?  I am guessing it compiles for you, so is it
> perhaps a GCCism?

Unnamed bitfields are standard C afaik.
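
A minimal standalone example of the construct in question:

struct bits {
    unsigned int lo:2;
    unsigned int :2;   /* unnamed bit-field: two bits of padding, standard C */
    unsigned int hi:4;
};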

>> @@ -1852,6 +1876,14 @@ x86_decode(
>>                              op_bytes = 8;
>>                          }
>>                      }
>> +                    if ( b == 0x62 )
>> +                    {
>> +                        evex.raw[0] = vex.raw[0];
>> +                        evex.raw[1] = vex.raw[1];
>> +                        evex.raw[2] = insn_fetch_type(uint8_t);
>> +
>> +                        vex.opcx = evex.opcx;
> 
> What is the meaning of opcx? The manuals list these as the mm fields.

Well, we're already using opcx instead of mmmmm for VEX, so it seems
natural to also do so for EVEX. In particular, I'm of the opinion that field
names like mmmmm or vvvv are rather meaningless.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 08/17] x86emul: generate and make use of canonical opcode representation
  2016-09-14 17:30   ` Andrew Cooper
@ 2016-09-15  6:43     ` Jan Beulich
  2016-09-27 14:03       ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-15  6:43 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 14.09.16 at 19:30, <andrew.cooper3@citrix.com> wrote:
>> @@ -435,6 +438,51 @@ struct x86_emulate_ctxt
>>      void *data;
>>  };
>>  
>> +/*
>> + * This encodes the opcode extension in a "natural" way:
> 
> I am not sure what you mean by natural way here.  All you seem to mean
> is that you are encoding instructions with the following method

Hence the quotes. Do you have a suggestion for a better word?

>> + *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
>> + *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
>> + *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
>> + *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
>> + *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
>> + *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
>> + * Hence no separate #define-s get added.
> 
> Please also describe what the xxxx fields mean.  Looking below, I guess
> that the bottom byte is the opcode itself, and some bits of the 2nd byte
> are legacy prefixes?

Yes. Comment extended.
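
For reference, a worked example of the resulting layout, as it follows
from the quoted macros (0f 28 is movaps, 66 0f 28 movapd, VEX.66 0f 28
vmovapd):

X86EMUL_OPC(0x0f, 0x28)        == 0x000f0028
X86EMUL_OPC_66(0x0f, 0x28)     == 0x000f0128
X86EMUL_OPC_VEX_66(0x0f, 0x28) == 0x000f1128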

>> + */
>> +#define X86EMUL_OPC_EXT_MASK         0xffff0000
>> +#define X86EMUL_OPC(ext, byte)       ((byte) | \
>> +                                      MASK_INSR((ext), 
> X86EMUL_OPC_EXT_MASK))
> 
> I would highly suggest using ((byte) & 0xff).  In the case that a change
> is slightly out of range, this should cause a compiler error (duplicate
> case statement) rather than a very subtle bug.

Well, okay.
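
So presumably the adjusted macro ends up along these lines (a sketch):

#define X86EMUL_OPC(ext, byte)       (((byte) & 0xff) | \
                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))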

>> +/*
>> + * This includes the 0x66, 0xF3, and 0xF2 prefixes when used to alter
>> + * functionality instead of just insn attributes, as well as VEX/EVEX:
>> + */
>> +#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
>> +                                     X86EMUL_OPC_KIND_MASK)
> 
> The definition should presumably live after introducing the PFX_MASK and
> KIND_MASK ?

I would prefer it to live closer to X86EMUL_OPC_EXT_MASK.

>> +#define X86EMUL_OPC_PFX_MASK         0x00000300
>> +# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
>> +# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
>> +# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
> 
> The PFX mask is moderately obvious from here, but a sentence describing
> what is legitimate to add in the future wouldn't go amiss.

I don't understand the "what is legitimate to add in the future"
part: Nothing should be added to this set.

>> +
>> +#define X86EMUL_OPC_KIND_MASK        0x00003000
>> +#define X86EMUL_OPC_VEX_             0x00001000
> 
> OTOH, I am rather more confused about what is eligible for inclusion
> into "kind".  Also, what does a kind of 0 indicate?

VEX, XOP, and EVEX are the valid non-zero kinds. Zero (I would
say obviously) means neither of those three.
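
A sketch of how a consumer might dispatch on it, using the masks quoted
above:

switch ( ctxt->opcode & X86EMUL_OPC_KIND_MASK )
{
case X86EMUL_OPC_VEX_:
    /* VEX-encoded form */
    break;
case X86EMUL_OPC_EVEX_:
    /* EVEX-encoded form */
    break;
default:
    /* legacy encoding (or a kind whose value isn't in the quoted hunk) */
    break;
}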

>> +# define X86EMUL_OPC_VEX(ext, byte) \
>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
>> +# define X86EMUL_OPC_VEX_66(ext, byte) \
>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
>> +# define X86EMUL_OPC_VEX_F3(ext, byte) \
>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
>> +# define X86EMUL_OPC_VEX_F2(ext, byte) \
>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
>> +#define X86EMUL_OPC_EVEX_            0x00002000
>> +# define X86EMUL_OPC_EVEX(ext, byte) \
>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
>> +# define X86EMUL_OPC_EVEX_66(ext, byte) \
>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
>> +# define X86EMUL_OPC_EVEX_F3(ext, byte) \
>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
>> +# define X86EMUL_OPC_EVEX_F2(ext, byte) \
>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
> 
> Why do we go to the effort of spelling out the individual VEX/EVEX
> possibilities, but not the XOP ones?

Because I need some of them right away, but we currently don't
emulate any XOP insns. If you feel strongly about it, I surely can
add XOP ones.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-14 17:56   ` Andrew Cooper
@ 2016-09-15  6:55     ` Jan Beulich
  2016-09-27 13:42       ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-15  6:55 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Boris Ostrovsky, Suravee Suthikulpanit

>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:14, Jan Beulich wrote:
>>  int __get_instruction_length_from_list(struct vcpu *v,
>>          const enum instruction_index *list, unsigned int list_count)
>>  {
>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>> -    unsigned int i, j, inst_len = 0;
>> -    enum instruction_index instr = 0;
>> -    u8 buf[MAX_INST_LEN];
>> -    const u8 *opcode = NULL;
>> -    unsigned long fetch_addr, fetch_limit;
>> -    unsigned int fetch_len, max_len;
>> +    struct hvm_emulate_ctxt ctxt;
>> +    struct x86_emulate_state *state;
>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>> +    int modrm_mod;
>>  
>> +#ifdef NDEBUG
> 
> Presumably this is just for your testing?

No, I actually meant it to stay that way. Along the lines of the extra
debugging code we have in map_domain_page().

>>      if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
>>          return inst_len;
>>  
>>      if ( vmcb->exitcode == VMEXIT_IOIO )
>>          return vmcb->exitinfo2 - vmcb->rip;
>> +#endif
>>  
>> -    /* Fetch up to the next page break; we'll fetch from the next page
>> -     * later if we have to. */
>> -    fetch_addr = svm_rip2pointer(v, &fetch_limit);
>> -    if ( vmcb->rip > fetch_limit )
>> -        return 0;
>> -    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
>> -    fetch_len = min_t(unsigned int, max_len,
>> -                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
>> -    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
>> +    ASSERT(v == current);
>> +    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
>> +    hvm_emulate_init(&ctxt, NULL, 0);
>> +    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
>> +    if ( IS_ERR_OR_NULL(state) )
>>          return 0;
>>  
>> -    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
>> -    {
>> -        inst_len++;
>> -        if ( inst_len >= fetch_len )
>> -        {
>> -            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
>> -                        max_len - fetch_len) )
>> -                return 0;
>> -            fetch_len = max_len;
>> -        }
>> +    inst_len = x86_insn_length(state, &ctxt.ctxt);
>> +    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
>> +    x86_emulate_free_state(state);
> 
> From an API point of view, it is weird to have x86_emulate_free_state()
> without a matching allocation function.  Perhaps that is just me.

With x86_decode_insn() returning the state, that to me _is_ the
allocation function.

> However, the x86_insn_modrm() API is definitely more weird.  Wouldn't it
> be more natural to take optional pointers for the mod, rm and reg parts
> individually?

I could change it to that, but I did it this way because without mod
at least rm is meaningless. Or said differently, I can't really see there
being a caller not caring about mod.
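
A sketch of the intended use, along the lines of the SVM caller above
(the VMCALL numbers come from opc_tab: 0f 01 d9, i.e. MODRM(3, 3, 1)):

unsigned int rm, reg;
int mod = x86_insn_modrm(state, &rm, &reg);

if ( mod < 0 )
    /* no ModRM byte (or bad state) */;
else if ( mod == 3 && reg == 3 && rm == 1 )
    /* register form d9, i.e. the VMCALL entry (0f 01 /3) */;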

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -382,7 +382,7 @@ struct operand {
>>      } mem;
>>  };
>>  #ifdef __x86_64__
>> -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
>> +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
>>  #else
>>  #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>>  #endif
> 
> Given that these are now used for general pointer poisoning, they should
> be renamed.  There are only 3 instances.

Okay. I'll make the PTR_POISON then.

>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>  
>>      switch ( ctxt->opcode )
>>      {
>> +    case 0x90: /* nop / pause */
>> +        if ( repe_prefix() )
>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>> +        break;
> 
> Why is it necessary to special case the rep prefix handling in this case?

Because SVM's pause intercept should not mistakenly also accept a
plain NOP.
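
Concretely, with the special case above the two encodings no longer
collide in the canonical representation:

/* 90    -> X86EMUL_OPC(0, 0x90)     - plain NOP
 * f3 90 -> X86EMUL_OPC_F3(0, 0x90)  - PAUSE, i.e. opc_tab[INSTR_PAUSE]
 */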

>> +unsigned int
>> +x86_insn_length(const struct x86_emulate_state *state,
>> +                const struct x86_emulate_ctxt *ctxt)
>> +{
>> +    check_state(state);
>> +
>> +    return state->eip - ctxt->regs->eip;
> 
> Is it worth stashing a starting eip?  This calculation will go wrong
> after the emulated state has been committed.

This function (taking a state parameter) can't be called by users of
x86_emulate(), and I don't think we need to cater for callers
committing state themselves - they should clearly use the result of
this function for what to commit in the first place.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase
  2016-09-14  9:55     ` Jan Beulich
@ 2016-09-23 14:48       ` Andrew Cooper
  2016-09-23 15:04         ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-23 14:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 14/09/16 10:55, Jan Beulich wrote:
>>>> On 13.09.16 at 20:44, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:07, Jan Beulich wrote:
>>> @@ -1602,6 +1602,45 @@ struct x86_emulate_state {
>>>   #define _regs (state->regs)
>>>   
>>>   static int
>>> +x86_decode_base(
>> What do you mean by decode_base here?
> The base instruction set (no 0f or alike prefixes). Suggestions for
> a better name welcome.

x86_decode_onebyte() to match the table of opcodes it is further decoding.

>
>>> @@ -2644,18 +2704,13 @@ x86_emulate(
>>>   
>>>       case 0x9a: /* call (far, absolute) */ {
>>>           struct segment_register reg;
>>> -        uint16_t sel;
>>> -        uint32_t eip;
>>>   
>>> -        generate_exception_if(mode_64bit(), EXC_UD, -1);
>>> +        ASSERT(!mode_64bit());
>> Are we going to strictly require that no one ever hand-crafts an
>> x86_emulate_state and hands it to x86_emulate()?
> Absolutely - that's why its definition does not live in a header.

Ok.

>
>> I would suggest leaving the generate_exception_if(mode_64bit(), EXC_UD,
>> -1); after the ASSERT() so even if we do end up in a wonky state, we
>> don't try to jump the guest to 0.
> That would look really strange to a reader, I think, and hence I'd
> rather not do this if I can get the patch accepted without.

It is conceptually no different to

default:
ASSERT_UNREACHABLE();
return;

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase
  2016-09-23 14:48       ` Andrew Cooper
@ 2016-09-23 15:04         ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-23 15:04 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 23.09.16 at 16:48, <andrew.cooper3@citrix.com> wrote:
> On 14/09/16 10:55, Jan Beulich wrote:
>>>>> On 13.09.16 at 20:44, <andrew.cooper3@citrix.com> wrote:
>>> I would suggest leaving the generate_exception_if(mode_64bit(), EXC_UD,
>>> -1); after the ASSERT() so even if we do end up in a wonky state, we
>>> don't try to jump the guest to 0.
>> That would look really strange to a reader, I think, and hence I'd
>> rather not do this if I can get the patch accepted without.
> 
> It is conceptually no different to
> 
> default:
> ASSERT_UNREACHABLE();
> return;

Except that
- we don't always follow ASSERT_UNREACHABLE() with return (or
  whatever else),
- here we don't risk doing anything bad to ourselves or the guest: We'll
  just produce undefined behavior, which is to be expected if we are
  in a "wonky state".

If leaving the exception generation in is the only way to get this
ack-ed, I'll do so, but I'd prefer if I wasn't required to.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-14 15:05     ` Jan Beulich
@ 2016-09-23 16:34       ` Andrew Cooper
  2016-09-26  7:34         ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-23 16:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 14/09/16 16:05, Jan Beulich wrote:
>>>> On 14.09.16 at 16:22, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:10, Jan Beulich wrote:
>>> This way we can at least size (and e.g. skip) them if needed, and we
>>> also won't raise the wrong fault due to not having read all relevant
>>> bytes.
>> What faults are you referring to?  #UD vs #GP from hitting the %cs limit?
> Or #PF.
>
>>> This at once adds correct raising of #UD for the three "ud<n>" flavors
>>> (Intel names only "ud2", but AMD names all three of them in their
>>> opcode maps), as that may make a difference to callers compared to
>>> getting back X86EMUL_UNHANDLEABLE.
>> Definitely a good improvement.  I have been meaning to do this for a while.
>>
>> Intel does reference 0FB9 in a footnote in the opcode map, but I can't
>> see any mention of 0FFF at all.
> Check AMD's.
>
>>> Note on opcodes 0FA6 and 0FA7: These are VIA's PadLock instructions,
>>> which have a ModRM like byte where only register forms are valid. I.e.
>>> we could also use SrcImmByte there, but ModRM is more likely to be
>>> correct for a hypothetical extension allowing non-register operations.
>> Won't the use of ModRM possibly cause us to read too much if it end up
>> with SIB and displacement encoding?  OTOH, do we really care?
> That's why I've added that paragraph: I'd be fine either way, but I
> do think the intention is a ModRM byte. Which is then also in line with
> these opcodes' uses in early 386 and 486 processors (xbts/ibts/
> cmpxchg).
>
>>> Note on opcode 0FB8: I think we're safe to ignore JMPE (which doesn't
>>> take a ModRM byte, but an immediate).
>> It took a while to find out what this instruction is.  Mind indicating
>> that it is Itanium-specific in the commit message?
> Sure.
>
>> POPCNT, the aliased instruction takes a full ModRM byte with no space to
>> distinguish.
> Well, distinguishing them is possible in principle, as by the time we
> process bytes past the main opcode one we already know whether
> an F3 prefix was present. I simply think it's not worth trying to do
> so.

It would be helpful if you listed all of the decoding changes.

From the looks of things, the instructions changed are:

LAR, LSL, CLTS, UD2, FEMMS, 3DNow!, a bunch of SSE instructions, RDPMC, 
GETSEC, more SSE/MMX, RSM, POPCNT, UD1, yet more SSE,


A couple of other misc points:

What is the point of having 0F3A specified with 
DstReg|SrcImmByte|ModRM?  Being a prefix, it shouldn't be treated like a 
plain operation.

0F6F was previously ImplicitOps|ModRM, but looks like it should be ModRM 
like the rest of 0F6x.  0F7F, 0FC7 and 0FE7 similarly.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 05/17] x86emul: add XOP decoding
  2016-09-14 16:21     ` Jan Beulich
@ 2016-09-23 17:01       ` Andrew Cooper
  0 siblings, 0 replies; 50+ messages in thread
From: Andrew Cooper @ 2016-09-23 17:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 14/09/16 17:21, Jan Beulich wrote:
>>>> On 14.09.16 at 18:11, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:11, Jan Beulich wrote:
>>> @@ -1580,6 +1586,9 @@ struct x86_emulate_state {
>>>           ext_0f   = vex_0f,
>>>           ext_0f38 = vex_0f38,
>>>           ext_0f3a = vex_0f3a,
>>> +        ext_8f08 = 8,
>>> +        ext_8f09,
>>> +        ext_8f0a,
>> What is this = 8 for?  I presume you didn't slip it in accidentally, but
>> I still can't figure out why.
> So I can use the value directly from vex.opcx, without further
> adjustment.

Ok, but please leave a comment to that effect.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-23 16:34       ` Andrew Cooper
@ 2016-09-26  7:34         ` Jan Beulich
  2016-09-27 13:28           ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-26  7:34 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 23.09.16 at 18:34, <andrew.cooper3@citrix.com> wrote:
> It would be helpful if you listed all of the decoding changes.
> 
> From the looks of things, the instructions changed are:

I don't see the point: If any of them got proper emulation added,
I'd agree. But with the purpose of the patch being to simply add
correct decoding for _all_ instructions in this group, it is quite
obvious that everything gets modified which so far didn't have
sufficient information for decoding. What exactly those
instructions do becomes of interest once we add actual emulation.

> A couple of other misc points:
> 
> What is the point of having 0F3A specified with 
> DstReg|SrcImmByte|ModRM?  Being a prefix, it shouldn't be treated like a 
> plain operation.

You can view it either way, and for our purposes it is clearly easier
this way: The static tables are really mainly decoding helpers, and
all three groups (0F0F, 0F38, and 0F3A) have the nice property
that their operands are sufficiently uniform across the actual
opcodes. Hence the static tables better treat them as individual
opcodes (or else we'd have to introduce further tables with - for
each one of them - all entries identical), while actual emulation
(once added) would of course need to distinguish the various
operations.

> 0F6F was previously ImplicitOps|ModRM, but looks like it should be ModRM 
> like the rest of 0F6x.  0F7F, 0FC7 and 0FE7 similarly.

Why? As mentioned elsewhere I think the (otherwise benign)
ImplicitOps (as well as the individual DstImplicit and SrcImplicit)
serve as documentation: Opcodes we actually handle have them
specified, whereas opcodes getting decoded but not emulated
don't. See the MOVQ and MOVD patches in the other series, which
add ImplicitOps to the table entries they add emulation for.

(The one corner case here would be operations without any
operands, but that's only the two forms of NOP, and I think we
can accept that they don't fit this model.)

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-26  7:34         ` Jan Beulich
@ 2016-09-27 13:28           ` Andrew Cooper
  2016-09-27 13:51             ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-27 13:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 26/09/16 08:34, Jan Beulich wrote:
>
>> 0F6F was previously ImplicitOps|ModRM, but looks like it should be ModRM 
>> like the rest of 0F6x.  0F7F, 0FC7 and 0FE7 similarly.
> Why? As mentioned elsewhere I think the (otherwise benign)
> ImplicitOps (as well as the individual DstImplicit and SrcImplicit)
> serve as documentation: Opcodes we actually handle have them
> specified, whereas opcodes getting decoded but not emulated
> don't. See the MOVQ and MOVD patches in the other series, which
> add ImplicitOps to the table entries they add emulation for.

By that argument, any instruction we have an emulation for should gain
ImplicitOps.

As it has the value 0, I only find that it further confuses an already
complicated piece of logic, as reading the decode gives the false
impression that something is different.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-15  6:55     ` Jan Beulich
@ 2016-09-27 13:42       ` Andrew Cooper
  2016-09-27 13:56         ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-27 13:42 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Boris Ostrovsky, Suravee Suthikulpanit

On 15/09/16 07:55, Jan Beulich wrote:
>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:14, Jan Beulich wrote:
>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>          const enum instruction_index *list, unsigned int list_count)
>>>  {
>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>> -    unsigned int i, j, inst_len = 0;
>>> -    enum instruction_index instr = 0;
>>> -    u8 buf[MAX_INST_LEN];
>>> -    const u8 *opcode = NULL;
>>> -    unsigned long fetch_addr, fetch_limit;
>>> -    unsigned int fetch_len, max_len;
>>> +    struct hvm_emulate_ctxt ctxt;
>>> +    struct x86_emulate_state *state;
>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>> +    int modrm_mod;
>>>  
>>> +#ifdef NDEBUG
>> Presumably this is just for your testing?
> No, I actually meant it to stay that way. Along the lines of the extra
> debugging code we have in map_domain_page().

I was never very happy with the older version of this debugging.  Surely
in a case like this, we should use the intercept information when
available, and check it against the emulator in a debug build.

That way, we don't entirely change the underlying logic in this function
> between a debug and a non-debug build.

>>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>>  
>>>      switch ( ctxt->opcode )
>>>      {
>>> +    case 0x90: /* nop / pause */
>>> +        if ( repe_prefix() )
>>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>>> +        break;
>> Why is it necessary to special case the rep prefix handling in this case?
> Because SVM's pause intercept should not mistakenly also accept a
> plain NOP.

How about a comment to this effect:

/* Note: Prefixes such as rep/repne are only encoded when semantically
meaningful to the instruction, to reduce the complexity of interpreting
this opcode representation. */

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/17] x86emul: complete decoding of two-byte instructions
  2016-09-27 13:28           ` Andrew Cooper
@ 2016-09-27 13:51             ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-27 13:51 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 27.09.16 at 15:28, <andrew.cooper3@citrix.com> wrote:
> On 26/09/16 08:34, Jan Beulich wrote:
>>
>>> 0F6F was previously ImplicitOps|ModRM, but looks like it should be ModRM 
>>> like the rest of 0F6x.  0F7F, 0FC7 and 0FE7 similarly.
>> Why? As mentioned elsewhere I think the (otherwise benign)
>> ImplicitOps (as well as the individual DstImplicit and SrcImplicit)
>> serve as documentation: Opcodes we actually handle have them
>> specified, whereas opcodes getting decoded but not emulated
>> don't. See the MOVQ and MOVD patches in the other series, which
>> add ImplicitOps to the table entries they add emulation for.
> 
> By that argument, any instruction we have an emulation for should gain
> ImplicitOps.

Unless it has Src* or Dst* specifiers, yes. And I believe that to be
the case.

> As it has the value 0, I only find that it further confuses an already
> complicated piece of logic, as reading the decode gives the false
> impression that something is different.

Well, I wouldn't necessarily mind cleaning this up (albeit I'm also not
fully convinced, as I think this doc aspect has some relevance), but
not in this series.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-27 13:42       ` Andrew Cooper
@ 2016-09-27 13:56         ` Jan Beulich
  2016-09-27 15:53           ` Andrew Cooper
  0 siblings, 1 reply; 50+ messages in thread
From: Jan Beulich @ 2016-09-27 13:56 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Boris Ostrovsky, Suravee Suthikulpanit

>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote:
> On 15/09/16 07:55, Jan Beulich wrote:
>>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>>> On 08/09/16 14:14, Jan Beulich wrote:
>>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>>          const enum instruction_index *list, unsigned int list_count)
>>>>  {
>>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>>> -    unsigned int i, j, inst_len = 0;
>>>> -    enum instruction_index instr = 0;
>>>> -    u8 buf[MAX_INST_LEN];
>>>> -    const u8 *opcode = NULL;
>>>> -    unsigned long fetch_addr, fetch_limit;
>>>> -    unsigned int fetch_len, max_len;
>>>> +    struct hvm_emulate_ctxt ctxt;
>>>> +    struct x86_emulate_state *state;
>>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>>> +    int modrm_mod;
>>>>  
>>>> +#ifdef NDEBUG
>>> Presumably this is just for your testing?
>> No, I actually meant it to stay that way. Along the lines of the extra
>> debugging code we have in map_domain_page().
> 
> I was never very happy with the older version of this debugging.  Surely
> in a case like this, we should use the intercept information when
> available, and check it against the emulator in a debug build.
> 
> That way, we don't entirely change the underlying logic in this function
> between a debug and a non-debug build.

But that is exactly what the code is doing:

#ifndef NDEBUG
    if ( vmcb->exitcode == VMEXIT_IOIO )
        j = vmcb->exitinfo2 - vmcb->rip;
    else
        j = svm_nextrip_insn_length(v);
    if ( j && j != inst_len )
    {
        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
                ctxt.ctxt.opcode, inst_len, j);
        return j;
    }
#endif

I.e. in case of a mismatch we use the data from hardware, plus a
message gets logged. In case of a match we further exercise the
opcode lookup logic, which non-debug builds would never hit on
capable hardware.

>>>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>>>  
>>>>      switch ( ctxt->opcode )
>>>>      {
>>>> +    case 0x90: /* nop / pause */
>>>> +        if ( repe_prefix() )
>>>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>>>> +        break;
>>> Why is it necessary to special case the rep prefix handling in this case?
>> Because SVM's pause intercept should not mistakenly also accept a
>> plain NOP.
> 
> How about a comment to this effect:
> 
> /* Note: Prefixes such as rep/repne are only encoded when semantically
> meaningful to the instruction, to reduce the complexity of interpreting
> this opcode representation. */

Yes, I've added a slight variation of this to the definition of
X86EMUL_OPC_PFX_MASK.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 08/17] x86emul: generate and make use of canonical opcode representation
  2016-09-15  6:43     ` Jan Beulich
@ 2016-09-27 14:03       ` Andrew Cooper
  2016-09-28  7:24         ` Jan Beulich
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Cooper @ 2016-09-27 14:03 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 15/09/16 07:43, Jan Beulich wrote:
>>>> On 14.09.16 at 19:30, <andrew.cooper3@citrix.com> wrote:
>>> @@ -435,6 +438,51 @@ struct x86_emulate_ctxt
>>>      void *data;
>>>  };
>>>  
>>> +/*
>>> + * This encodes the opcode extension in a "natural" way:
>> I am not sure what you mean by natural way here.  All you seem to mean
>> is that you are encoding instructions with the following method
> Hence the quotes. Do you have a suggestion for a better word?

It doesn't need qualifying at all.  It is fine to state simply that this
is the representation chosen to be used.

The commit message is the better place to make an argument as to why
this is a sensible representation, but as this comment is simply a
description of the encoding format, the "natural" feels out of place.

>
>>> +#define X86EMUL_OPC_PFX_MASK         0x00000300
>>> +# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
>>> +# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
>>> +# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
>> The PFX mask is moderately obvious from here, but a sentence describing
>> what is legitimate to add in the future wouldn't go amiss.
> I don't understand the "what is legitimate to add in the future"
> part: Nothing should be added to this set.

It occurs to me that using only 2 bits rather than 8 bits for the prefix
> information would help the compiler make smaller switch statements.

>
>>> +
>>> +#define X86EMUL_OPC_KIND_MASK        0x00003000
>>> +#define X86EMUL_OPC_VEX_             0x00001000
>> OTOH, I am rather more confused about what is eligible for inclusion
>> into "kind".  Also, what does a kind of 0 indicate?
> VEX, XOP, and EVEX are the valid non-zero kinds. Zero (I would
> say obviously) means neither of those three.

It is not clear how "kind" is a suitable collective term for VEX/XOP/EVEX.

Or in other words, X86EMUL_OPC_KIND_MASK doesn't provide any hint that
the operation is referring to a legacy or vex encoding of the instruction.

Would s/kind/encoding/ be ok?  At that point, X86EMUL_OPC_LEGACY_ with a
value of 0 might be useful.  (e.g. perhaps (opcode &
X86EMUL_OPC_ENCODING_MASK) == X86EMUL_OPC_LEGACY_?)

>
>>> +# define X86EMUL_OPC_VEX(ext, byte) \
>>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
>>> +# define X86EMUL_OPC_VEX_66(ext, byte) \
>>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
>>> +# define X86EMUL_OPC_VEX_F3(ext, byte) \
>>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
>>> +# define X86EMUL_OPC_VEX_F2(ext, byte) \
>>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
>>> +#define X86EMUL_OPC_EVEX_            0x00002000
>>> +# define X86EMUL_OPC_EVEX(ext, byte) \
>>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
>>> +# define X86EMUL_OPC_EVEX_66(ext, byte) \
>>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
>>> +# define X86EMUL_OPC_EVEX_F3(ext, byte) \
>>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
>>> +# define X86EMUL_OPC_EVEX_F2(ext, byte) \
>>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
>> Why do we go to the effort of spelling out the individual VEX/EVEX
>> possibilities, but not the XOP ones?
> Because I need some of them right away, but we currently don't
> emulate any XOP insns. If you feel strongly about it, I surely can
> add XOP ones.

That's ok - I presume we will be gaining some in due course.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/17] SVM: use generic instruction decoding
  2016-09-27 13:56         ` Jan Beulich
@ 2016-09-27 15:53           ` Andrew Cooper
  0 siblings, 0 replies; 50+ messages in thread
From: Andrew Cooper @ 2016-09-27 15:53 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Boris Ostrovsky, Suravee Suthikulpanit

On 27/09/16 14:56, Jan Beulich wrote:
>>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote:
>> On 15/09/16 07:55, Jan Beulich wrote:
>>>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>>>> On 08/09/16 14:14, Jan Beulich wrote:
>>>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>>>          const enum instruction_index *list, unsigned int list_count)
>>>>>  {
>>>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>>>> -    unsigned int i, j, inst_len = 0;
>>>>> -    enum instruction_index instr = 0;
>>>>> -    u8 buf[MAX_INST_LEN];
>>>>> -    const u8 *opcode = NULL;
>>>>> -    unsigned long fetch_addr, fetch_limit;
>>>>> -    unsigned int fetch_len, max_len;
>>>>> +    struct hvm_emulate_ctxt ctxt;
>>>>> +    struct x86_emulate_state *state;
>>>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>>>> +    int modrm_mod;
>>>>>  
>>>>> +#ifdef NDEBUG
>>>> Presumably this is just for your testing?
>>> No, I actually meant it to stay that way. Along the lines of the extra
>>> debugging code we have in map_domain_page().
>> I was never very happy with the older version of this debugging.  Surely
>> in a case like this, we should use the intercept information when
>> available, and check it against the emulator in a debug build.
>>
>> That way, we don't entirely change the underlying logic in this function
>> between a debug and a non-debug build.
> But that is exactly what the code is doing:
>
> #ifndef NDEBUG
>     if ( vmcb->exitcode == VMEXIT_IOIO )
>         j = vmcb->exitinfo2 - vmcb->rip;
>     else
>         j = svm_nextrip_insn_length(v);
>     if ( j && j != inst_len )
>     {
>         gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
>                 ctxt.ctxt.opcode, inst_len, j);
>         return j;
>     }
> #endif
>
> I.e. in case of a mismatch we use the data from hardware, plus a
> message gets logged. In case of a match we further exercise the
> opcode lookup logic, which non-debug builds would never hit on
> capable hardware.

Ah yes - I see now.  The split between #ifdef NDEBUG and #ifndef NDEBUG
is the confusing factor.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 08/17] x86emul: generate and make use of canonical opcode representation
  2016-09-27 14:03       ` Andrew Cooper
@ 2016-09-28  7:24         ` Jan Beulich
  0 siblings, 0 replies; 50+ messages in thread
From: Jan Beulich @ 2016-09-28  7:24 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 27.09.16 at 16:03, <andrew.cooper3@citrix.com> wrote:
> On 15/09/16 07:43, Jan Beulich wrote:
>>>>> On 14.09.16 at 19:30, <andrew.cooper3@citrix.com> wrote:
>>>> +#define X86EMUL_OPC_PFX_MASK         0x00000300
>>>> +# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
>>>> +# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
>>>> +# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
>>> The PFX mask is moderately obvious from here, but a sentence describing
>>> what is legitimate to add in the future wouldn't go amiss.
>> I don't understand the "what is legitimate to add in the future"
>> part: Nothing should be added to this set.
> 
> It occurs to me that using only 2 bits rather than 8 bits for the prefix
> information would help the compiler make smaller switch statements.

I don't think this would help - the compiler struggles with the
high 16 bits, and that wouldn't change. I'm surprised they're not
smart enough to split this into a few compares and a couple of
independent branch tables.

>>>> +#define X86EMUL_OPC_KIND_MASK        0x00003000
>>>> +#define X86EMUL_OPC_VEX_             0x00001000
>>> OTOH, I am rather more confused about what is eligible for inclusion
>>> into "kind".  Also, what does a kind of 0 indicate?
>> VEX, XOP, and EVEX are the valid non-zero kinds. Zero (I would
>> say obviously) means neither of those three.
> 
> It is not clear how "kind" is a suitable collective term for VEX/XOP/EVEX.
> 
> Or in other words, X86EMUL_OPC_KIND_MASK doesn't provide any hint that
> the operation is referring to a legacy or vex encoding of the instruction.
> 
> Would s/kind/encoding/ be ok?

Sure, changed.

> At that point, X86EMUL_OPC_LEGACY_ with a
> value of 0 might be useful.  (e.g. perhaps (opcode &
> X86EMUL_OPC_ENCODING_MASK) == X86EMUL_OPC_LEGACY_?)

Added for completeness (but it'll be unused for now).

>>>> +# define X86EMUL_OPC_VEX(ext, byte) \
>>>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
>>>> +# define X86EMUL_OPC_VEX_66(ext, byte) \
>>>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
>>>> +# define X86EMUL_OPC_VEX_F3(ext, byte) \
>>>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
>>>> +# define X86EMUL_OPC_VEX_F2(ext, byte) \
>>>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
>>>> +#define X86EMUL_OPC_EVEX_            0x00002000
>>>> +# define X86EMUL_OPC_EVEX(ext, byte) \
>>>> +    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
>>>> +# define X86EMUL_OPC_EVEX_66(ext, byte) \
>>>> +    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
>>>> +# define X86EMUL_OPC_EVEX_F3(ext, byte) \
>>>> +    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
>>>> +# define X86EMUL_OPC_EVEX_F2(ext, byte) \
>>>> +    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
>>> Why do we go to the effort of spelling out the individual VEX/EVEX
>>> possibilities, but not the XOP ones?
>> Because I need some of them right away, but we currently don't
>> emulate any XOP insns. If you feel strongly about it, I surely can
>> add XOP ones.
> 
> That's ok - I presume we will be gaining some in due course.

Actually I was wrong in the earlier reply - the lack of XOP
counterparts is because they wouldn't get encoded this way.
Instead they'd use X86EMUL_OPC(0x8fXX, 0xYY). Whether a
"shorthand" making this X86EMUL_OPC_XOP(0xXX, 0xYY) or
X86EMUL_OPC_XOP_XX(0xYY) would be worthwhile I'm not sure
at this point, so I'd rather leave it out until we actually get to
see what's most suitable.
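
Purely for illustration (the map and opcode byte below are hypothetical
placeholders, and neither shorthand exists), an XOP map-9 instruction
keyed with the existing macro would look like:

    /* Hypothetical XOP map 9, opcode byte 0x01: the 0x8f escape plus
     * the map number form the "ext" argument, so no separate encoding
     * bit is needed for XOP. */
    case X86EMUL_OPC(0x8f09, 0x01):
        /* ... emulation would go here ... */
        break;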

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2016-09-28  7:24 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-08 12:58 [PATCH 00/17] x86: split insn emulator decode and execution Jan Beulich
2016-09-08 13:04 ` [PATCH 01/17] x86emul: split instruction decoding from execution Jan Beulich
2016-09-09 18:35   ` Andrew Cooper
2016-09-12  7:20     ` Jan Beulich
2016-09-08 13:07 ` [PATCH 02/17] x86emul: fetch all insn bytes during the decode phase Jan Beulich
2016-09-13 18:44   ` Andrew Cooper
2016-09-14  9:55     ` Jan Beulich
2016-09-23 14:48       ` Andrew Cooper
2016-09-23 15:04         ` Jan Beulich
2016-09-08 13:08 ` [PATCH 04/17] x86emul: track only rIP in emulator state Jan Beulich
2016-09-08 13:23   ` Jan Beulich
2016-09-08 13:09 ` [PATCH 03/17] " Jan Beulich
2016-09-13 19:09   ` Andrew Cooper
2016-09-14  9:58     ` Jan Beulich
2016-09-08 13:10 ` [PATCH 04/17] x86emul: complete decoding of two-byte instructions Jan Beulich
2016-09-14 14:22   ` Andrew Cooper
2016-09-14 15:05     ` Jan Beulich
2016-09-23 16:34       ` Andrew Cooper
2016-09-26  7:34         ` Jan Beulich
2016-09-27 13:28           ` Andrew Cooper
2016-09-27 13:51             ` Jan Beulich
2016-09-08 13:11 ` [PATCH 05/17] x86emul: add XOP decoding Jan Beulich
2016-09-14 16:11   ` Andrew Cooper
2016-09-14 16:21     ` Jan Beulich
2016-09-23 17:01       ` Andrew Cooper
2016-09-08 13:12 ` [PATCH 06/17] x86emul: add EVEX decoding Jan Beulich
2016-09-14 17:05   ` Andrew Cooper
2016-09-15  6:26     ` Jan Beulich
2016-09-08 13:13 ` [PATCH 07/17] x86emul: move x86_execute() common epilogue code Jan Beulich
2016-09-08 13:28   ` Jan Beulich
2016-09-14 17:13   ` Andrew Cooper
2016-09-08 13:14 ` [PATCH 08/17] x86emul: generate and make use of canonical opcode representation Jan Beulich
2016-09-14 17:30   ` Andrew Cooper
2016-09-15  6:43     ` Jan Beulich
2016-09-27 14:03       ` Andrew Cooper
2016-09-28  7:24         ` Jan Beulich
2016-09-08 13:14 ` [PATCH 09/17] SVM: use generic instruction decoding Jan Beulich
2016-09-14 17:56   ` Andrew Cooper
2016-09-15  6:55     ` Jan Beulich
2016-09-27 13:42       ` Andrew Cooper
2016-09-27 13:56         ` Jan Beulich
2016-09-27 15:53           ` Andrew Cooper
2016-09-08 13:16 ` [PATCH 10/17] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
2016-09-08 13:17 ` [PATCH 11/17] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
2016-09-08 13:17 ` [PATCH 12/17] x86/PV: split out dealing with DRn " Jan Beulich
2016-09-08 13:18 ` [PATCH 13/17] x86/PV: split out dealing with MSRs " Jan Beulich
2016-09-08 13:18 ` [PATCH 14/17] x86emul: support XSETBV Jan Beulich
2016-09-08 13:19 ` [PATCH 15/17] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
2016-09-08 13:20 ` [PATCH 16/17] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
2016-09-08 13:21 ` [PATCH 17/17] x86emul: don't assume a memory operand Jan Beulich
