* [PATCH v2 00/16] x86: split insn emulator decode and execution
@ 2016-09-28  7:59 Jan Beulich
  2016-09-28  8:06 ` [PATCH v2 01/16] x86emul: split instruction decoding from execution Jan Beulich
                   ` (16 more replies)
  0 siblings, 17 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  7:59 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

..., complete the decoder, leverage decoding for SVM instruction
sizing and PV 32-bit call gate emulation, and use the emulator for
PV priv-op handling.

01: x86emul: split instruction decoding from execution
02: x86emul: fetch all insn bytes during the decode phase
03: x86emul: track only rIP in emulator state
04: x86emul: complete decoding of two-byte instructions
05: x86emul: add XOP decoding
06: x86emul: add EVEX decoding
07: x86emul: generate and make use of a canonical opcode representation
08: SVM: use generic instruction decoding
09: x86/32on64: use generic instruction decoding
10: x86/PV: split out dealing with CRn from privileged instruction handling
11: x86/PV: split out dealing with DRn from privileged instruction handling
12: x86/PV: split out dealing with MSRs from privileged instruction handling
13: x86emul: support XSETBV
14: x86emul: sort opcode 0f01 special case switch() statement
15: x86/PV: use generic emulator for privileged instruction handling
16: x86emul: don't assume a memory operand

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Only minor adjustments; see individual patches for details.



* [PATCH v2 01/16] x86emul: split instruction decoding from execution
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
@ 2016-09-28  8:06 ` Jan Beulich
  2016-09-28 16:24   ` Andrew Cooper
  2016-09-28  8:07 ` [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase Jan Beulich
                   ` (15 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:06 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


This is only the mechanical part; a subsequent patch will make the
non-mechanical adjustments to actually do all decoding in this new
function.
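
Distilled into a standalone sketch (invented stand-in types and a single
example opcode, not the actual Xen structures in the diff below), the
intended shape is a decode step that only reads instruction bytes and
records what it found, and an execute step that consumes that recorded
state:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct x86_emulate_state: everything the decode phase
 * learns about the instruction, handed on to execution. */
struct decode_state {
    uint8_t opcode;
    unsigned int op_bytes;
    unsigned long imm1;       /* immediate operand, if any */
};

/* Decode phase: only reads instruction bytes, never touches state. */
static int decode(const uint8_t *insn, struct decode_state *s)
{
    s->opcode = insn[0];
    s->op_bytes = 4;
    s->imm1 = 0;
    if ( s->opcode >= 0xb8 && s->opcode <= 0xbf )   /* mov $imm32,%r32 */
        s->imm1 = insn[1] | (insn[2] << 8) | (insn[3] << 16) |
                  ((unsigned long)insn[4] << 24);
    return 0;
}

/* Execute phase: consumes only the pre-decoded state. */
static int execute(const struct decode_state *s, unsigned long *reg)
{
    if ( s->opcode >= 0xb8 && s->opcode <= 0xbf )
        *reg = s->imm1;
    return 0;
}

int main(void)
{
    /* mov $0x12345678,%eax */
    const uint8_t insn[] = { 0xb8, 0x78, 0x56, 0x34, 0x12 };
    struct decode_state s;
    unsigned long eax = 0;

    if ( decode(insn, &s) == 0 && execute(&s, &eax) == 0 )
        printf("eax = %#lx\n", eax);            /* prints 0x12345678 */
    return 0;
}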

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Fix a coding style issue.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -48,7 +48,9 @@
 /* All operands are implicit in the opcode. */
 #define ImplicitOps (DstImplicit|SrcImplicit)
 
-static uint8_t opcode_table[256] = {
+typedef uint8_t opcode_desc_t;
+
+static const opcode_desc_t opcode_table[256] = {
     /* 0x00 - 0x07 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
@@ -178,7 +180,7 @@ static uint8_t opcode_table[256] = {
     ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
 };
 
-static uint8_t twobyte_table[256] = {
+static const opcode_desc_t twobyte_table[256] = {
     /* 0x00 - 0x07 */
     SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
     /* 0x08 - 0x0F */
@@ -1569,32 +1571,63 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
-int
-x86_emulate(
-    struct x86_emulate_ctxt *ctxt,
-    const struct x86_emulate_ops  *ops)
-{
-    /* Shadow copy of register state. Committed on successful emulation. */
-    struct cpu_user_regs _regs = *ctxt->regs;
+struct x86_emulate_state {
+    unsigned int op_bytes, ad_bytes;
+
+    enum { ext_none, ext_0f, ext_0f38 } ext;
+    uint8_t opcode;
+    uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+    uint8_t rex_prefix;
+    bool lock_prefix;
+    opcode_desc_t desc;
+    union vex vex;
+    int override_seg;
 
-    uint8_t b, d, sib, sib_index, sib_base, rex_prefix = 0;
-    uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
-    enum { ext_none, ext_0f, ext_0f38 } ext = ext_none;
-    union vex vex = {};
-    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
-    bool_t lock_prefix = 0;
-    int override_seg = -1, rc = X86EMUL_OKAY;
-    struct operand src = { .reg = REG_POISON };
-    struct operand dst = { .reg = REG_POISON };
-    enum x86_swint_type swint_type;
-    struct x86_emulate_stub stub = {};
-    DECLARE_ALIGNED(mmval_t, mmval);
     /*
      * Data operand effective address (usually computed from ModRM).
      * Default is a memory operand relative to segment DS.
      */
-    struct operand ea = { .type = OP_MEM, .reg = REG_POISON };
-    ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */
+    struct operand ea;
+
+    /* Immediate operand values, if any. Use otherwise unused fields. */
+#define imm1 ea.val
+#define imm2 ea.orig_val
+
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs regs;
+};
+
+/* Helper definitions. */
+#define op_bytes (state->op_bytes)
+#define ad_bytes (state->ad_bytes)
+#define ext (state->ext)
+#define modrm (state->modrm)
+#define modrm_mod (state->modrm_mod)
+#define modrm_reg (state->modrm_reg)
+#define modrm_rm (state->modrm_rm)
+#define rex_prefix (state->rex_prefix)
+#define lock_prefix (state->lock_prefix)
+#define vex (state->vex)
+#define override_seg (state->override_seg)
+#define ea (state->ea)
+#define _regs (state->regs)
+
+static int
+x86_decode(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops  *ops)
+{
+    uint8_t b, d, sib, sib_index, sib_base;
+    unsigned int def_op_bytes, def_ad_bytes;
+    int rc = X86EMUL_OKAY;
+
+    memset(state, 0, sizeof(*state));
+    override_seg = -1;
+    ea.type = OP_MEM;
+    ea.mem.seg = x86_seg_ds;
+    ea.reg = REG_POISON;
+    _regs = *ctxt->regs;
 
     ctxt->retire.byte = 0;
 
@@ -1811,7 +1844,7 @@ x86_emulate(
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
             default: /* Until it is worth making this table based ... */
-                goto cannot_emulate;
+                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -1943,6 +1976,61 @@ x86_emulate(
     if ( override_seg != -1 && ea.type == OP_MEM )
         ea.mem.seg = override_seg;
 
+    /* Fetch the immediate operand, if present. */
+    switch ( d & SrcMask )
+    {
+        unsigned int bytes;
+
+    case SrcImm:
+        if ( !(d & ByteOp) )
+            bytes = op_bytes != 8 ? op_bytes : 4;
+        else
+        {
+    case SrcImmByte:
+            bytes = 1;
+        }
+        /* NB. Immediates are sign-extended as necessary. */
+        switch ( bytes )
+        {
+        case 1: imm1 = insn_fetch_type(int8_t);  break;
+        case 2: imm1 = insn_fetch_type(int16_t); break;
+        case 4: imm1 = insn_fetch_type(int32_t); break;
+        }
+        break;
+    case SrcImm16:
+        imm1 = insn_fetch_type(uint16_t);
+        break;
+    }
+
+    state->opcode = b;
+    state->desc = d;
+
+ done:
+    return rc;
+}
+
+int
+x86_emulate(
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    struct x86_emulate_state state;
+    int rc;
+    uint8_t b, d;
+    struct operand src = { .reg = REG_POISON };
+    struct operand dst = { .reg = REG_POISON };
+    enum x86_swint_type swint_type;
+    struct x86_emulate_stub stub = {};
+    DECLARE_ALIGNED(mmval_t, mmval);
+
+    rc = x86_decode(&state, ctxt, ops);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    b = state.opcode;
+    d = state.desc;
+#define state (&state)
+
     /* Decode and fetch the source operand: register, memory or immediate. */
     switch ( d & SrcMask )
     {
@@ -1998,18 +2086,12 @@ x86_emulate(
             src.bytes = 1;
         }
         src.type  = OP_IMM;
-        /* NB. Immediates are sign-extended as necessary. */
-        switch ( src.bytes )
-        {
-        case 1: src.val = insn_fetch_type(int8_t);  break;
-        case 2: src.val = insn_fetch_type(int16_t); break;
-        case 4: src.val = insn_fetch_type(int32_t); break;
-        }
+        src.val   = imm1;
         break;
     case SrcImm16:
         src.type  = OP_IMM;
         src.bytes = 2;
-        src.val   = insn_fetch_type(uint16_t);
+        src.val   = imm1;
         break;
     }
 
@@ -4863,8 +4945,8 @@ x86_emulate(
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
-    /* Zero the upper 32 bits of %rip if not in long mode. */
-    if ( def_ad_bytes < sizeof(_regs.eip) )
+    /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
+    if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
     *ctxt->regs = _regs;
@@ -4878,4 +4960,19 @@ x86_emulate(
     _put_fpu();
     put_stub(stub);
     return X86EMUL_UNHANDLEABLE;
+#undef state
 }
+
+#undef op_bytes
+#undef ad_bytes
+#undef ext
+#undef modrm
+#undef modrm_mod
+#undef modrm_reg
+#undef modrm_rm
+#undef rex_prefix
+#undef lock_prefix
+#undef vex
+#undef override_seg
+#undef ea
+#undef _regs




* [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
  2016-09-28  8:06 ` [PATCH v2 01/16] x86emul: split instruction decoding from execution Jan Beulich
@ 2016-09-28  8:07 ` Jan Beulich
  2016-09-28 16:37   ` Andrew Cooper
  2016-09-28  8:08 ` [PATCH v2 03/16] x86emul: track only rIP in emulator state Jan Beulich
                   ` (14 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


This way we can offer callers the service of just sizing instructions,
and we can also better guarantee not to raise the wrong fault due to
not having read all relevant bytes.
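
As a rough, self-contained illustration of both points (the helper
names and the two opcodes handled here are made up for the example;
this is not the interface the series exposes): fetching every byte,
immediates included, during decode means the instruction's length is
simply how far the fetch pointer advanced, and any fetch fault surfaces
before execution has done anything.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy decoder covering only nop (0x90) and "mov $imm32,%r32"
 * (0xb8-0xbf); it fetches *all* bytes, including the immediate,
 * and only advances 'ip'. */
static int toy_decode(const uint8_t *buf, size_t len, size_t *ip)
{
    uint8_t op;

    if ( *ip >= len )
        return -1;                      /* would fault on fetch */
    op = buf[(*ip)++];

    if ( op == 0x90 )                   /* nop */
        return 0;
    if ( op >= 0xb8 && op <= 0xbf )     /* mov imm32,r32 */
    {
        if ( *ip + 4 > len )
            return -1;                  /* fault before any side effect */
        *ip += 4;                       /* consume the immediate too */
        return 0;
    }
    return -1;                          /* unhandled */
}

/* "Sizing" service: decode only, report the length, execute nothing. */
static int insn_length(const uint8_t *buf, size_t len)
{
    size_t ip = 0;
    return toy_decode(buf, len, &ip) ? -1 : (int)ip;
}

int main(void)
{
    const uint8_t bytes[] = { 0xb8, 0x78, 0x56, 0x34, 0x12 };
    printf("length = %d\n", insn_length(bytes, sizeof(bytes))); /* 5 */
    return 0;
}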

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Rename x86_decode_base() -> x86_decode_onebyte().

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -129,8 +129,8 @@ static const opcode_desc_t opcode_table[
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, ImplicitOps,
     /* 0xA0 - 0xA7 */
-    ByteOp|DstEax|SrcImplicit|Mov, DstEax|SrcImplicit|Mov,
-    ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+    ByteOp|DstEax|SrcMem|Mov, DstEax|SrcMem|Mov,
+    ByteOp|DstMem|SrcEax|Mov, DstMem|SrcEax|Mov,
     ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
     ByteOp|ImplicitOps, ImplicitOps,
     /* 0xA8 - 0xAF */
@@ -1613,6 +1613,45 @@ struct x86_emulate_state {
 #define _regs (state->regs)
 
 static int
+x86_decode_onebyte(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_OKAY;
+
+    switch ( state->opcode )
+    {
+    case 0x9a: /* call (far, absolute) */
+    case 0xea: /* jmp (far, absolute) */
+        generate_exception_if(mode_64bit(), EXC_UD, -1);
+
+        imm1 = insn_fetch_bytes(op_bytes);
+        imm2 = insn_fetch_type(uint16_t);
+        break;
+
+    case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+    case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
+        /* Source EA is not encoded via ModRM. */
+        ea.mem.off = insn_fetch_bytes(ad_bytes);
+        break;
+
+    case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
+        if ( op_bytes == 8 ) /* Fetch more bytes to obtain imm64. */
+            imm1 = ((uint32_t)imm1 |
+                    ((uint64_t)insn_fetch_type(uint32_t) << 32));
+        break;
+
+    case 0xc8: /* enter imm16,imm8 */
+        imm2 = insn_fetch_type(uint8_t);
+        break;
+    }
+
+ done:
+    return rc;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -2005,10 +2044,29 @@ x86_decode(
     state->opcode = b;
     state->desc = d;
 
+    switch ( ext )
+    {
+    case ext_none:
+        rc = x86_decode_onebyte(state, ctxt, ops);
+        break;
+
+    case ext_0f:
+    case ext_0f38:
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
  done:
     return rc;
 }
 
+/* No insn fetching past this point. */
+#undef insn_fetch_bytes
+#undef insn_fetch_type
+
 int
 x86_emulate(
     struct x86_emulate_ctxt *ctxt,
@@ -2571,6 +2629,8 @@ x86_emulate(
     case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
         generate_exception_if((modrm_reg & 7) != 0, EXC_UD, -1);
     case 0x88 ... 0x8b: /* mov */
+    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
+    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         dst.val = src.val;
         break;
 
@@ -2655,18 +2715,13 @@ x86_emulate(
 
     case 0x9a: /* call (far, absolute) */ {
         struct segment_register reg;
-        uint16_t sel;
-        uint32_t eip;
 
-        generate_exception_if(mode_64bit(), EXC_UD, -1);
+        ASSERT(!mode_64bit());
         fail_if(ops->read_segment == NULL);
 
-        eip = insn_fetch_bytes(op_bytes);
-        sel = insn_fetch_type(uint16_t);
-
         if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
-             (rc = load_seg(x86_seg_cs, sel, 0, &cs, ctxt, ops)) ||
-             (validate_far_branch(&cs, eip),
+             (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
+             (validate_far_branch(&cs, imm1),
               rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
                               &reg.sel, op_bytes, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
@@ -2674,7 +2729,7 @@ x86_emulate(
              (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) )
             goto done;
 
-        _regs.eip = eip;
+        _regs.eip = imm1;
         break;
     }
 
@@ -2716,23 +2771,6 @@ x86_emulate(
         ((uint8_t *)&_regs.eax)[1] = (_regs.eflags & 0xd7) | 0x02;
         break;
 
-    case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
-        /* Source EA is not encoded via ModRM. */
-        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
-                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
-            goto done;
-        break;
-
-    case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
-        /* Destination EA is not encoded via ModRM. */
-        dst.type  = OP_MEM;
-        dst.mem.seg = ea.mem.seg;
-        dst.mem.off = insn_fetch_bytes(ad_bytes);
-        dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        dst.val   = (unsigned long)_regs.eax;
-        break;
-
     case 0xa4 ... 0xa5: /* movs */ {
         unsigned long nr_reps = get_rep_prefix();
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
@@ -2850,9 +2888,6 @@ x86_emulate(
         break;
 
     case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */
-        if ( dst.bytes == 8 ) /* Fetch more bytes to obtain imm64 */
-            src.val = ((uint32_t)src.val |
-                       ((uint64_t)insn_fetch_type(uint32_t) << 32));
         dst.reg = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
         dst.val = src.val;
@@ -2916,7 +2951,7 @@ x86_emulate(
         goto les;
 
     case 0xc8: /* enter imm16,imm8 */ {
-        uint8_t depth = insn_fetch_type(uint8_t) & 31;
+        uint8_t depth = imm2 & 31;
         int i;
 
         dst.type = OP_REG;
@@ -3629,17 +3664,12 @@ x86_emulate(
         jmp_rel((int32_t)src.val);
         break;
 
-    case 0xea: /* jmp (far, absolute) */ {
-        uint16_t sel;
-        uint32_t eip;
-        generate_exception_if(mode_64bit(), EXC_UD, -1);
-        eip = insn_fetch_bytes(op_bytes);
-        sel = insn_fetch_type(uint16_t);
-        if ( (rc = load_seg(x86_seg_cs, sel, 0, &cs, ctxt, ops)) ||
-             (rc = commit_far_branch(&cs, eip)) )
+    case 0xea: /* jmp (far, absolute) */
+        ASSERT(!mode_64bit());
+        if ( (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
+             (rc = commit_far_branch(&cs, imm1)) )
             goto done;
         break;
-    }
 
     case 0xf1: /* int1 (icebp) */
         src.val = EXC_DB;




* [PATCH v2 03/16] x86emul: track only rIP in emulator state
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
  2016-09-28  8:06 ` [PATCH v2 01/16] x86emul: split instruction decoding from execution Jan Beulich
  2016-09-28  8:07 ` [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase Jan Beulich
@ 2016-09-28  8:08 ` Jan Beulich
  2016-09-28 16:41   ` Andrew Cooper
  2016-09-28  8:08 ` [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions Jan Beulich
                   ` (13 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:08 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


Now that all decoding happens in x86_decode() there's no need to keep
the local copy of the register state in struct x86_emulate_state. Only
rIP gets updated in the decode phase, so only that register needs
tracking there. All other (read-only) registers can be read from the
original structure (but sadly, due to it getting passed to
decode_register(), the pointer can't be made to point to "const", which
would have let the compiler help ensure no modification happens).
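
Condensed into a standalone sketch (stand-in types rather than the real
ones, and using a const pointer, which the real code can't for the
reason just given), the resulting division of register handling is:

#include <stdio.h>

/* Stand-in for the caller-provided register block. */
struct regs { unsigned long eip, eax; };

/* Decode-phase state: just the advancing rIP plus a pointer used for
 * read-only accesses (effective-address computation and the like). */
struct decode_state {
    unsigned long eip;          /* the only register decode modifies   */
    const struct regs *regs;    /* everything else is read, not copied */
};

static void decode(struct decode_state *s, const struct regs *in,
                   unsigned int insn_len)
{
    s->regs = in;
    s->eip = in->eip + insn_len;    /* advance past the fetched bytes */
}

static void execute(struct regs *out, const struct decode_state *s)
{
    struct regs shadow = *s->regs;  /* shadow copy lives here again    */

    shadow.eip = s->eip;            /* sync rIP to post-decode value   */
    shadow.eax = 0x1234;            /* pretend the insn did something  */
    *out = shadow;                  /* commit on success               */
}

int main(void)
{
    struct regs r = { .eip = 0x1000, .eax = 0 };
    struct decode_state s;

    decode(&s, &r, 5);
    execute(&r, &s);
    printf("eip=%#lx eax=%#lx\n", r.eip, r.eax);   /* eip=0x1005 */
    return 0;
}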

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Defer decoding register of the ModRM mod 3 case until the
    execution phase.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -590,9 +590,9 @@ do{ asm volatile (
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch_bytes(_size)                                         \
-({ unsigned long _x = 0, _eip = _regs.eip;                              \
-   _regs.eip += (_size); /* real hardware doesn't truncate */           \
-   generate_exception_if((uint8_t)(_regs.eip -                          \
+({ unsigned long _x = 0, _eip = state->eip;                             \
+   state->eip += (_size); /* real hardware doesn't truncate */          \
+   generate_exception_if((uint8_t)(state->eip -                         \
                                    ctxt->regs->eip) > MAX_INST_LEN,     \
                          EXC_GP, 0);                                    \
    rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt);          \
@@ -1593,8 +1593,8 @@ struct x86_emulate_state {
 #define imm1 ea.val
 #define imm2 ea.orig_val
 
-    /* Shadow copy of register state. Committed on successful emulation. */
-    struct cpu_user_regs regs;
+    unsigned long eip;
+    struct cpu_user_regs *regs;
 };
 
 /* Helper definitions. */
@@ -1610,7 +1610,6 @@ struct x86_emulate_state {
 #define vex (state->vex)
 #define override_seg (state->override_seg)
 #define ea (state->ea)
-#define _regs (state->regs)
 
 static int
 x86_decode_onebyte(
@@ -1666,7 +1665,8 @@ x86_decode(
     ea.type = OP_MEM;
     ea.mem.seg = x86_seg_ds;
     ea.reg = REG_POISON;
-    _regs = *ctxt->regs;
+    state->regs = ctxt->regs;
+    state->eip = ctxt->regs->eip;
 
     ctxt->retire.byte = 0;
 
@@ -1770,7 +1770,7 @@ x86_decode(
             default:
                 BUG();
             case 2:
-                if ( in_realmode(ctxt, ops) || (_regs.eflags & EFLG_VM) )
+                if ( in_realmode(ctxt, ops) || (state->regs->eflags & EFLG_VM) )
                     break;
                 /* fall through */
             case 4:
@@ -1895,8 +1895,6 @@ x86_decode(
         {
             modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
-            ea.reg  = decode_register(
-                modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0));
         }
         else if ( ad_bytes == 2 )
         {
@@ -1904,33 +1902,33 @@ x86_decode(
             switch ( modrm_rm )
             {
             case 0:
-                ea.mem.off = _regs.ebx + _regs.esi;
+                ea.mem.off = state->regs->ebx + state->regs->esi;
                 break;
             case 1:
-                ea.mem.off = _regs.ebx + _regs.edi;
+                ea.mem.off = state->regs->ebx + state->regs->edi;
                 break;
             case 2:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.esi;
+                ea.mem.off = state->regs->ebp + state->regs->esi;
                 break;
             case 3:
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp + _regs.edi;
+                ea.mem.off = state->regs->ebp + state->regs->edi;
                 break;
             case 4:
-                ea.mem.off = _regs.esi;
+                ea.mem.off = state->regs->esi;
                 break;
             case 5:
-                ea.mem.off = _regs.edi;
+                ea.mem.off = state->regs->edi;
                 break;
             case 6:
                 if ( modrm_mod == 0 )
                     break;
                 ea.mem.seg = x86_seg_ss;
-                ea.mem.off = _regs.ebp;
+                ea.mem.off = state->regs->ebp;
                 break;
             case 7:
-                ea.mem.off = _regs.ebx;
+                ea.mem.off = state->regs->ebx;
                 break;
             }
             switch ( modrm_mod )
@@ -1957,14 +1955,15 @@ x86_decode(
                 sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
                 sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
                 if ( sib_index != 4 )
-                    ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0);
+                    ea.mem.off = *(long *)decode_register(sib_index,
+                                                          state->regs, 0);
                 ea.mem.off <<= (sib >> 6) & 3;
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.esp;
+                    ea.mem.off += state->regs->esp;
                     if ( !ext && (b == 0x8f) )
                         /* POP <rm> computes its EA post increment. */
                         ea.mem.off += ((mode_64bit() && (op_bytes == 4))
@@ -1973,15 +1972,17 @@ x86_decode(
                 else if ( sib_base == 5 )
                 {
                     ea.mem.seg  = x86_seg_ss;
-                    ea.mem.off += _regs.ebp;
+                    ea.mem.off += state->regs->ebp;
                 }
                 else
-                    ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0);
+                    ea.mem.off += *(long *)decode_register(sib_base,
+                                                           state->regs, 0);
             }
             else
             {
                 modrm_rm |= (rex_prefix & 1) << 3;
-                ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0);
+                ea.mem.off = *(long *)decode_register(modrm_rm,
+                                                      state->regs, 0);
                 if ( (modrm_rm == 5) && (modrm_mod != 0) )
                     ea.mem.seg = x86_seg_ss;
             }
@@ -1994,7 +1995,7 @@ x86_decode(
                 if ( !mode_64bit() )
                     break;
                 /* Relative to RIP of next instruction. Argh! */
-                ea.mem.off += _regs.eip;
+                ea.mem.off += state->eip;
                 if ( (d & SrcMask) == SrcImm )
                     ea.mem.off += (d & ByteOp) ? 1 :
                         ((op_bytes == 8) ? 4 : op_bytes);
@@ -2072,6 +2073,8 @@ x86_emulate(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
+    /* Shadow copy of register state. Committed on successful emulation. */
+    struct cpu_user_regs _regs = *ctxt->regs;
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
@@ -2085,10 +2088,17 @@ x86_emulate(
     if ( rc != X86EMUL_OKAY )
         return rc;
 
+    /* Sync rIP to post decode value. */
+    _regs.eip = state.eip;
+
     b = state.opcode;
     d = state.desc;
 #define state (&state)
 
+    if ( ea.type == OP_REG )
+        ea.reg = decode_register(modrm_rm, &_regs,
+                                 (d & ByteOp) && !rex_prefix);
+
     /* Decode and fetch the source operand: register, memory or immediate. */
     switch ( d & SrcMask )
     {
@@ -5005,4 +5015,3 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
-#undef _regs




* [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (2 preceding siblings ...)
  2016-09-28  8:08 ` [PATCH v2 03/16] x86emul: track only rIP in emulator state Jan Beulich
@ 2016-09-28  8:08 ` Jan Beulich
  2016-09-28 17:22   ` Andrew Cooper
  2016-09-28  8:09 ` [PATCH v2 05/16] x86emul: add XOP decoding Jan Beulich
                   ` (12 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:08 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.

This at once adds correct raising of #UD for the three "ud<n>" flavors
(Intel names only "ud2", but AMD names all three of them in their
opcode maps), as that may make a difference to callers compared to
getting back X86EMUL_UNHANDLEABLE.

Note on opcodes 0FA6 and 0FA7: These are VIA's PadLock instructions,
which have a ModRM-like byte where only register forms are valid. I.e.
we could also use SrcImmByte there, but ModRM is more likely to be
correct for a hypothetical extension allowing non-register operations.

Note on opcode 0FB8: I think we're safe to ignore the Itanium-specific
JMPE (which doesn't take a ModRM byte, but an immediate).
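
A self-contained sketch of what the filled-in table buys callers (the
flag names, helper, and tiny opcode subset are invented for
illustration; prefixes, SIB, and displacement bytes are ignored): even
for instructions the emulator won't execute, decode can now report a
length, and for ud0/ud1/ud2 it can report "this would raise #UD"
rather than just "unhandleable".

#include <stdint.h>
#include <stdio.h>

#define F_MODRM 0x01   /* opcode is followed by a ModRM byte      */
#define F_UD    0x02   /* decodes fine but architecturally #UD    */

/* A few illustrative entries of a 0x0f-prefixed opcode map. */
static uint8_t twobyte_flags(uint8_t op)
{
    switch ( op )
    {
    case 0x0b: return F_UD;            /* ud2             */
    case 0xb9: return F_UD | F_MODRM;  /* ud1             */
    case 0xff: return F_UD | F_MODRM;  /* ud0             */
    case 0x31: return 0;               /* rdtsc           */
    default:   return F_MODRM;         /* most of the map */
    }
}

/* Returns 1 if the insn would raise #UD, 0 otherwise; *len gets the
 * (simplified) instruction length either way. */
static int classify(const uint8_t *insn, int *len)
{
    uint8_t fl;

    if ( insn[0] != 0x0f )
        return -1;                      /* not a two-byte opcode */
    fl = twobyte_flags(insn[1]);
    *len = 2 + !!(fl & F_MODRM);
    return !!(fl & F_UD);
}

int main(void)
{
    const uint8_t ud1[] = { 0x0f, 0xb9, 0xc0 };  /* ud1 %eax,%eax */
    int len;
    int ud = classify(ud1, &len);

    printf("len=%d %s\n", len, ud ? "#UD" : "ok");   /* len=3 #UD */
    return 0;
}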

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -182,11 +182,14 @@ static const opcode_desc_t opcode_table[
 
 static const opcode_desc_t twobyte_table[256] = {
     /* 0x00 - 0x07 */
-    SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
+    SrcMem16|ModRM, ImplicitOps|ModRM, ModRM, ModRM,
+    0, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0x08 - 0x0F */
-    ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
+    ImplicitOps, ImplicitOps, 0, ImplicitOps,
+    0, ImplicitOps|ModRM, ImplicitOps, ModRM|SrcImmByte,
     /* 0x10 - 0x17 */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x18 - 0x1F */
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
@@ -194,12 +197,13 @@ static const opcode_desc_t twobyte_table
     ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     0, 0, 0, 0,
     /* 0x28 - 0x2F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, 0, ImplicitOps|ModRM, 0, 0, 0, 0,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
+    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x30 - 0x37 */
-    ImplicitOps, ImplicitOps, ImplicitOps, 0,
-    ImplicitOps, ImplicitOps, 0, 0,
+    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+    ImplicitOps, ImplicitOps, 0, ImplicitOps,
     /* 0x38 - 0x3F */
-    DstReg|SrcMem|ModRM, 0, 0, 0, 0, 0, 0, 0,
+    DstReg|SrcMem|ModRM, 0, DstReg|SrcImmByte|ModRM, 0, 0, 0, 0, 0,
     /* 0x40 - 0x47 */
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
@@ -211,11 +215,15 @@ static const opcode_desc_t twobyte_table
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     /* 0x50 - 0x5F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0x60 - 0x6F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
     /* 0x70 - 0x7F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
+    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
+    ModRM, ModRM, ModRM, ImplicitOps,
+    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
     /* 0x80 - 0x87 */
     DstImplicit|SrcImm, DstImplicit|SrcImm,
     DstImplicit|SrcImm, DstImplicit|SrcImm,
@@ -238,9 +246,9 @@ static const opcode_desc_t twobyte_table
     ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
     /* 0xA0 - 0xA7 */
     ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, 0, 0,
+    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, ModRM, ModRM,
     /* 0xA8 - 0xAF */
-    ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
+    ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
     DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
     ImplicitOps|ModRM, DstReg|SrcMem|ModRM,
     /* 0xB0 - 0xB7 */
@@ -249,22 +257,26 @@ static const opcode_desc_t twobyte_table
     DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
     /* 0xB8 - 0xBF */
-    0, 0, DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
+    DstReg|SrcMem|ModRM, ModRM,
+    DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
     DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
     /* 0xC0 - 0xC7 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    0, DstMem|SrcReg|ModRM|Mov,
-    0, 0, 0, ImplicitOps|ModRM,
+    SrcImmByte|ModRM, DstMem|SrcReg|ModRM|Mov,
+    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, ImplicitOps|ModRM,
     /* 0xC8 - 0xCF */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xDF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0xE0 - 0xEF */
-    0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
     /* 0xF0 - 0xFF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
+    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
 };
 
 #define REX_PREFIX 0x40
@@ -1574,7 +1586,12 @@ int x86emul_unhandleable_rw(
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
-    enum { ext_none, ext_0f, ext_0f38 } ext;
+    enum {
+        ext_none = vex_none,
+        ext_0f   = vex_0f,
+        ext_0f38 = vex_0f38,
+        ext_0f3a = vex_0f3a,
+    } ext;
     uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
@@ -1651,6 +1668,34 @@ x86_decode_onebyte(
 }
 
 static int
+x86_decode_twobyte(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_OKAY;
+
+    switch ( state->opcode )
+    {
+    case 0x78:
+        if ( vex.opcx )
+            break;
+        switch ( vex.pfx )
+        {
+        case vex_66: /* extrq $imm8, $imm8, xmm */
+        case vex_f2: /* insertq $imm8, $imm8, xmm, xmm */
+            imm1 = insn_fetch_type(uint8_t);
+            imm2 = insn_fetch_type(uint8_t);
+            break;
+        }
+        break;
+    }
+
+ done:
+    return rc;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -1754,6 +1799,10 @@ x86_decode(
                 b = insn_fetch_type(uint8_t);
                 ext = ext_0f38;
                 break;
+            case 0x3a:
+                b = insn_fetch_type(uint8_t);
+                ext = ext_0f3a;
+                break;
             }
         }
     }
@@ -1809,10 +1858,22 @@ x86_decode(
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
-                fail_if(vex.opcx != vex_0f);
-                ext = ext_0f;
                 b = insn_fetch_type(uint8_t);
-                d = twobyte_table[b];
+                switch ( ext = vex.opcx )
+                {
+                case vex_0f:
+                    d = twobyte_table[b];
+                    break;
+                case vex_0f38:
+                    d = twobyte_table[0x38];
+                    break;
+                case vex_0f3a:
+                    d = twobyte_table[0x3a];
+                    break;
+                default:
+                    rc = X86EMUL_UNHANDLEABLE;
+                    goto done;
+                }
 
                 modrm = insn_fetch_type(uint8_t);
                 modrm_mod = (modrm & 0xc0) >> 6;
@@ -1870,9 +1931,12 @@ x86_decode(
             break;
 
         case ext_0f:
+        case ext_0f3a:
             break;
 
         case ext_0f38:
+            if ( vex.opcx )
+                break;
             switch ( b )
             {
             case 0xf0: /* movbe / crc32 */
@@ -2052,7 +2116,11 @@ x86_decode(
         break;
 
     case ext_0f:
+        rc = x86_decode_twobyte(state, ctxt, ops);
+        break;
+
     case ext_0f38:
+    case ext_0f3a:
         break;
 
     default:
@@ -2268,6 +2336,7 @@ x86_emulate(
         goto ext_0f38_insn;
     default:
         ASSERT_UNREACHABLE();
+    case ext_0f3a:
         goto cannot_emulate;
     }
 
@@ -4211,6 +4280,11 @@ x86_emulate(
             goto done;
         break;
 
+    case 0x0b: /* ud2 */
+    case 0xb9: /* ud1 */
+    case 0xff: /* ud0 */
+        generate_exception_if(1, EXC_UD, -1);
+
     case 0x0d: /* GrpP (prefetch) */
     case 0x18: /* Grp16 (prefetch/nop) */
     case 0x19 ... 0x1f: /* nop (amd-defined) */



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 05/16] x86emul: add XOP decoding
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (3 preceding siblings ...)
  2016-09-28  8:08 ` [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions Jan Beulich
@ 2016-09-28  8:09 ` Jan Beulich
  2016-09-29  9:07   ` Andrew Cooper
  2016-09-28  8:10 ` [PATCH v2 06/16] x86emul: add EVEX decoding Jan Beulich
                   ` (11 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:09 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 4010 bytes --]

This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.
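
For a concrete sense of the disambiguation this depends on: opcode byte 0x8f
is shared between "pop r/m" and the XOP escape, and the byte following it
tells them apart, since pop needs ModRM.reg to be zero while all valid XOP
map_select values (8, 9, 0x0a) have bit 3 set. The helper below is a sketch
invented for illustration; only the (b == 0x8f && (modrm & 0x18)) test is
taken from the hunk further down.

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustration only: tell an XOP prefix apart from "pop r/m". */
    static bool is_xop_prefix(uint8_t b, uint8_t next_byte)
    {
        /*
         * For pop, ModRM.reg (bits 3..5) must be zero; XOP map_select
         * values 8, 9 and 0x0a all have bit 3 set, so testing bits 3..4
         * is enough to pick the XOP interpretation.
         */
        return b == 0x8f && (next_byte & 0x18) != 0;
    }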

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add a comment.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -279,6 +279,12 @@ static const opcode_desc_t twobyte_table
     ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
 };
 
+static const opcode_desc_t xop_table[] = {
+    DstReg|SrcImmByte|ModRM,
+    DstReg|SrcMem|ModRM,
+    DstReg|SrcImm|ModRM,
+};
+
 #define REX_PREFIX 0x40
 #define REX_B 0x01
 #define REX_X 0x02
@@ -1591,6 +1597,13 @@ struct x86_emulate_state {
         ext_0f   = vex_0f,
         ext_0f38 = vex_0f38,
         ext_0f3a = vex_0f3a,
+        /*
+         * For XOP use values such that the respective instruction field
+         * can be used without adjustment.
+         */
+        ext_8f08 = 8,
+        ext_8f09,
+        ext_8f0a,
     } ext;
     uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -1813,7 +1822,7 @@ x86_decode(
         modrm = insn_fetch_type(uint8_t);
         modrm_mod = (modrm & 0xc0) >> 6;
 
-        if ( !ext && ((b & ~1) == 0xc4) )
+        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
             switch ( def_ad_bytes )
             {
             default:
@@ -1827,11 +1836,11 @@ x86_decode(
                     break;
                 /* fall through */
             case 8:
-                /* VEX */
+                /* VEX / XOP */
                 generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
 
                 vex.raw[0] = modrm;
-                if ( b & 1 )
+                if ( b == 0xc5 )
                 {
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
@@ -1859,18 +1868,30 @@ x86_decode(
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
-                switch ( ext = vex.opcx )
+                ext = vex.opcx;
+                if ( b != 0x8f )
+                {
+                    switch ( ext )
+                    {
+                    case vex_0f:
+                        d = twobyte_table[b];
+                        break;
+                    case vex_0f38:
+                        d = twobyte_table[0x38];
+                        break;
+                    case vex_0f3a:
+                        d = twobyte_table[0x3a];
+                        break;
+                    default:
+                        rc = X86EMUL_UNHANDLEABLE;
+                        goto done;
+                    }
+                }
+                else if ( ext < ext_8f08 +
+                                sizeof(xop_table) / sizeof(*xop_table) )
+                    d = xop_table[ext - ext_8f08];
+                else
                 {
-                case vex_0f:
-                    d = twobyte_table[b];
-                    break;
-                case vex_0f38:
-                    d = twobyte_table[0x38];
-                    break;
-                case vex_0f3a:
-                    d = twobyte_table[0x3a];
-                    break;
-                default:
                     rc = X86EMUL_UNHANDLEABLE;
                     goto done;
                 }
@@ -1932,6 +1953,9 @@ x86_decode(
 
         case ext_0f:
         case ext_0f3a:
+        case ext_8f08:
+        case ext_8f09:
+        case ext_8f0a:
             break;
 
         case ext_0f38:
@@ -2121,6 +2145,9 @@ x86_decode(
 
     case ext_0f38:
     case ext_0f3a:
+    case ext_8f08:
+    case ext_8f09:
+    case ext_8f0a:
         break;
 
     default:
@@ -2337,6 +2364,9 @@ x86_emulate(
     default:
         ASSERT_UNREACHABLE();
     case ext_0f3a:
+    case ext_8f08:
+    case ext_8f09:
+    case ext_8f0a:
         goto cannot_emulate;
     }
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 06/16] x86emul: add EVEX decoding
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (4 preceding siblings ...)
  2016-09-28  8:09 ` [PATCH v2 05/16] x86emul: add XOP decoding Jan Beulich
@ 2016-09-28  8:10 ` Jan Beulich
  2016-09-29  9:08   ` Andrew Cooper
  2016-09-28  8:12 ` [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation Jan Beulich
                   ` (10 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:10 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 2957 bytes --]

This way we can at least size (and e.g. skip) them if needed, and we
also won't raise the wrong fault due to not having read all relevant
bytes.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: I'm kind of undecided whether to right away propagate evex.R into
     modrm_reg (and then also deal with the new meaning of evex.x for
     modrm_rm). Since that doesn't affect GPRs (and the extra bits
     would need masking off when accessing GPRs) I've left this out for
     now.
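
Should that propagation be done later, a minimal sketch (names invented
here, not part of the patch) of what folding the extra bits in would amount
to, assuming the usual EVEX convention that R and R' are stored inverted and
extend ModRM.reg to a 5-bit vector register index:

    #include <stdint.h>

    /*
     * Illustration only: combine ModRM.reg with the (inverted) EVEX.R
     * and EVEX.R' bits from the first EVEX payload byte (P0).
     */
    static unsigned int evex_vec_reg(uint8_t modrm, uint8_t evex_p0)
    {
        unsigned int reg = (modrm >> 3) & 7;

        reg |= !(evex_p0 & 0x80) << 3; /* EVEX.R  - P0 bit 7, inverted */
        reg |= !(evex_p0 & 0x10) << 4; /* EVEX.R' - P0 bit 4, inverted */

        return reg; /* 0 ... 31, e.g. selecting xmm0 ... xmm31 */
    }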

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -336,6 +336,27 @@ union vex {
         ptr[1] = rex | REX_PREFIX; \
 } while (0)
 
+union evex {
+    uint8_t raw[3];
+    struct {
+        uint8_t opcx:2;
+        uint8_t :2;
+        uint8_t R:1;
+        uint8_t b:1;
+        uint8_t x:1;
+        uint8_t r:1;
+        uint8_t pfx:2;
+        uint8_t evex:1;
+        uint8_t reg:4;
+        uint8_t w:1;
+        uint8_t opmsk:3;
+        uint8_t RX:1;
+        uint8_t bcst:1;
+        uint8_t lr:2;
+        uint8_t z:1;
+    };
+};
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -1611,6 +1632,7 @@ struct x86_emulate_state {
     bool lock_prefix;
     opcode_desc_t desc;
     union vex vex;
+    union evex evex;
     int override_seg;
 
     /*
@@ -1638,6 +1660,7 @@ struct x86_emulate_state {
 #define rex_prefix (state->rex_prefix)
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
+#define evex (state->evex)
 #define override_seg (state->override_seg)
 #define ea (state->ea)
 
@@ -1826,7 +1849,8 @@ x86_decode(
         modrm = insn_fetch_type(uint8_t);
         modrm_mod = (modrm & 0xc0) >> 6;
 
-        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18))) )
+        if ( !ext && ((b & ~1) == 0xc4 || (b == 0x8f && (modrm & 0x18)) ||
+                      b == 0x62) )
             switch ( def_ad_bytes )
             {
             default:
@@ -1840,7 +1864,7 @@ x86_decode(
                     break;
                 /* fall through */
             case 8:
-                /* VEX / XOP */
+                /* VEX / XOP / EVEX */
                 generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
 
                 vex.raw[0] = modrm;
@@ -1867,6 +1891,14 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
+                    if ( b == 0x62 )
+                    {
+                        evex.raw[0] = vex.raw[0];
+                        evex.raw[1] = vex.raw[1];
+                        evex.raw[2] = insn_fetch_type(uint8_t);
+
+                        vex.opcx = evex.opcx;
+                    }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (5 preceding siblings ...)
  2016-09-28  8:10 ` [PATCH v2 06/16] x86emul: add EVEX decoding Jan Beulich
@ 2016-09-28  8:12 ` Jan Beulich
  2016-09-29 10:11   ` Andrew Cooper
  2016-09-28  8:13 ` [PATCH v2 08/16] SVM: use generic instruction decoding Jan Beulich
                   ` (9 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 34066 bytes --]

This representation is then being made available to interested callers,
to facilitate replacing their custom decoding.

This entails combining the three main switch statements into one.
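
To illustrate what a caller gains (the helper below is invented for the
example; the two case labels and the macros are the ones introduced by the
x86_emulate.h hunk further down), matching on ctxt->opcode collapses opcode
map, mandatory prefix and encoding into a single switch:

    #include "x86_emulate.h" /* assumes the header as extended below */

    /*
     * Illustration only: movbe and crc32 share opcode 0f38 f0 and are
     * told apart purely by the canonical representation.
     */
    static const char *name_0f38_f0(unsigned int opcode)
    {
        switch ( opcode )
        {
        case X86EMUL_OPC(0x0f38, 0xf0):    /* movbe m,r */
            return "movbe";
        case X86EMUL_OPC_F2(0x0f38, 0xf0): /* crc32 r/m8,r{32,64} */
            return "crc32";
        }
        return "something else";
    }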

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend comments. Use uint8_t cast in X86EMUL_OPC(). Rename
    X86EMUL_OPC_KIND_MASK to X86EMUL_OPC_ENCODING_MASK. Add
    X86EMUL_OPC_LEGACY_.

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -14,6 +14,9 @@ typedef bool bool_t;
 #define ASSERT assert
 #define ASSERT_UNREACHABLE() assert(!__LINE__)
 
+#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
+#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))
+
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1626,7 +1626,6 @@ struct x86_emulate_state {
         ext_8f09,
         ext_8f0a,
     } ext;
-    uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
     bool lock_prefix;
@@ -1672,7 +1671,7 @@ x86_decode_onebyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode )
     {
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
@@ -1711,11 +1710,9 @@ x86_decode_twobyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
     {
     case 0x78:
-        if ( vex.opcx )
-            break;
         switch ( vex.pfx )
         {
         case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -1724,7 +1721,23 @@ x86_decode_twobyte(
             imm2 = insn_fetch_type(uint8_t);
             break;
         }
-        break;
+        /* fall through */
+    case 0x10 ... 0x18:
+    case 0x28 ... 0x2f:
+    case 0x50 ... 0x77:
+    case 0x79 ... 0x7f:
+    case 0xae:
+    case 0xc2:
+    case 0xc4 ... 0xc7:
+    case 0xd0 ... 0xfe:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+        /* Intentionally not handling here despite being modified by F3:
+    case 0xb8: jmpe / popcnt
+    case 0xbc: bsf / tzcnt
+    case 0xbd: bsr / lzcnt
+         * They're being dealt with in the execution phase (if at all).
+         */
     }
 
  done:
@@ -1732,13 +1745,35 @@ x86_decode_twobyte(
 }
 
 static int
+x86_decode_0f38(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case 0x00 ... 0xef:
+    case 0xf2 ... 0xff:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
+    case 0xf0: case 0xf1: /* movbe / crc32 */
+        if ( rep_prefix() )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
     uint8_t b, d, sib, sib_index, sib_base;
-    unsigned int def_op_bytes, def_ad_bytes;
+    unsigned int def_op_bytes, def_ad_bytes, opcode;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
@@ -1819,29 +1854,31 @@ x86_decode(
 
     /* Opcode byte(s). */
     d = opcode_table[b];
-    if ( d == 0 )
+    if ( d == 0 && b == 0x0f)
     {
-        /* Two-byte opcode? */
-        if ( b == 0x0f )
+        /* Two-byte opcode. */
+        b = insn_fetch_type(uint8_t);
+        d = twobyte_table[b];
+        switch ( b )
         {
+        default:
+            opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f;
+            break;
+        case 0x38:
             b = insn_fetch_type(uint8_t);
-            d = twobyte_table[b];
-            switch ( b )
-            {
-            default:
-                ext = ext_0f;
-                break;
-            case 0x38:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f38;
-                break;
-            case 0x3a:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f3a;
-                break;
-            }
+            opcode = b | MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f38;
+            break;
+        case 0x3a:
+            b = insn_fetch_type(uint8_t);
+            opcode = b | MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f3a;
+            break;
         }
     }
+    else
+        opcode = b;
 
     /* ModRM and SIB bytes. */
     if ( d & ModRM )
@@ -1870,6 +1907,7 @@ x86_decode(
                 vex.raw[0] = modrm;
                 if ( b == 0xc5 )
                 {
+                    opcode = X86EMUL_OPC_VEX_;
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
                     vex.x = 1;
@@ -1891,31 +1929,44 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
-                    if ( b == 0x62 )
+                    switch ( b )
                     {
+                    case 0x62:
+                        opcode = X86EMUL_OPC_EVEX_;
                         evex.raw[0] = vex.raw[0];
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
                         vex.opcx = evex.opcx;
+                        break;
+                    case 0xc4:
+                        opcode = X86EMUL_OPC_VEX_;
+                        break;
+                    default:
+                        opcode = 0;
+                        break;
                     }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
+                opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
                 ext = vex.opcx;
                 if ( b != 0x8f )
                 {
                     switch ( ext )
                     {
                     case vex_0f:
+                        opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[b];
                         break;
                     case vex_0f38:
+                        opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x38];
                         break;
                     case vex_0f3a:
+                        opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x3a];
                         break;
                     default:
@@ -1925,7 +1976,11 @@ x86_decode(
                 }
                 else if ( ext < ext_8f08 +
                                 sizeof(xop_table) / sizeof(*xop_table) )
+                {
+                    opcode |= MASK_INSR(0x8f08 + ext - ext_8f08,
+                                        X86EMUL_OPC_EXT_MASK);
                     d = xop_table[ext - ext_8f08];
+                }
                 else
                 {
                     rc = X86EMUL_UNHANDLEABLE;
@@ -1995,9 +2050,7 @@ x86_decode(
             break;
 
         case ext_0f38:
-            if ( vex.opcx )
-                break;
-            switch ( b )
+            switch ( opcode & X86EMUL_OPC_MASK )
             {
             case 0xf0: /* movbe / crc32 */
                 d |= repne_prefix() ? ByteOp : Mov;
@@ -2006,8 +2059,6 @@ x86_decode(
                 if ( !repne_prefix() )
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
-            default: /* Until it is worth making this table based ... */
-                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -2166,7 +2217,7 @@ x86_decode(
         break;
     }
 
-    state->opcode = b;
+    ctxt->opcode = opcode;
     state->desc = d;
 
     switch ( ext )
@@ -2180,7 +2231,14 @@ x86_decode(
         break;
 
     case ext_0f38:
+        rc = x86_decode_0f38(state, ctxt, ops);
+        break;
+
     case ext_0f3a:
+        if ( !vex.opcx )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     case ext_8f08:
     case ext_8f09:
     case ext_8f0a:
@@ -2222,7 +2280,7 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
-    b = state.opcode;
+    b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
 
@@ -2389,24 +2447,7 @@ x86_emulate(
         break;
     }
 
-    switch ( ext )
-    {
-    case ext_none:
-        break;
-    case ext_0f:
-        goto ext_0f_insn;
-    case ext_0f38:
-        goto ext_0f38_insn;
-    default:
-        ASSERT_UNREACHABLE();
-    case ext_0f3a:
-    case ext_8f08:
-    case ext_8f09:
-    case ext_8f0a:
-        goto cannot_emulate;
-    }
-
-    switch ( b )
+    switch ( ctxt->opcode )
     {
         struct segment_register cs;
 
@@ -4108,15 +4149,7 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f_insn:
-    switch ( b )
-    {
-    case 0x00: /* Grp6 */
+    case X86EMUL_OPC(0x0f, 0x00): /* Grp6 */
         fail_if((modrm_reg & 6) != 2);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4125,7 +4158,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x01: /* Grp7 */ {
+    case X86EMUL_OPC(0x0f, 0x01): /* Grp7 */ {
         struct segment_register reg;
         unsigned long base, limit, cr0, cr0w;
 
@@ -4270,7 +4303,7 @@ x86_emulate(
         break;
     }
 
-    case 0x05: /* syscall */ {
+    case X86EMUL_OPC(0x0f, 0x05): /* syscall */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
 
@@ -4330,7 +4363,7 @@ x86_emulate(
         break;
     }
 
-    case 0x06: /* clts */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
         if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
@@ -4338,42 +4371,64 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x08: /* invd */
-    case 0x09: /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x08): /* invd */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->wbinvd == NULL);
         if ( (rc = ops->wbinvd(ctxt)) != 0 )
             goto done;
         break;
 
-    case 0x0b: /* ud2 */
-    case 0xb9: /* ud1 */
-    case 0xff: /* ud0 */
+    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
+    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
+    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
         generate_exception_if(1, EXC_UD, -1);
 
-    case 0x0d: /* GrpP (prefetch) */
-    case 0x18: /* Grp16 (prefetch/nop) */
-    case 0x19 ... 0x1f: /* nop (amd-defined) */
+    case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */
+    case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
+    case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case 0x2b: /* {,v}movntp{s,d} xmm,m128 */
-               /* vmovntp{s,d} ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
+                                         /* vmovntps ymm,m256 */
+    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
+                                         /* vmovntpd ymm,m256 */
         fail_if(ea.type != OP_MEM);
         /* fall through */
-    case 0x28: /* {,v}movap{s,d} xmm/m128,xmm */
-               /* vmovap{s,d} ymm/m256,ymm */
-    case 0x29: /* {,v}movap{s,d} xmm,xmm/m128 */
-               /* vmovap{s,d} ymm,ymm/m256 */
-        fail_if(vex.pfx & VEX_PREFIX_SCALAR_MASK);
-        /* fall through */
-    case 0x10: /* {,v}movup{s,d} xmm/m128,xmm */
-               /* vmovup{s,d} ymm/m256,ymm */
-               /* {,v}movss xmm/m32,xmm */
-               /* {,v}movsd xmm/m64,xmm */
-    case 0x11: /* {,v}movup{s,d} xmm,xmm/m128 */
-               /* vmovup{s,d} ymm,ymm/m256 */
-               /* {,v}movss xmm,xmm/m32 */
-               /* {,v}movsd xmm,xmm/m64 */
+    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
+                                         /* vmovaps ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
+                                         /* vmovapd ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
+                                         /* vmovaps ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
+                                         /* vmovapd ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
+                                         /* vmovups ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
+                                         /* vmovupd ymm/m256,ymm */
+    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
+    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
+                                         /* vmovups ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
+                                         /* vmovupd ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
+    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4396,10 +4451,9 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) ||
-                    ((vex.reg != 0xf) &&
-                     ((ea.type == OP_MEM) ||
-                      !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+            fail_if((vex.reg != 0xf) &&
+                    ((ea.type == OP_MEM) ||
+                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4437,10 +4491,10 @@ x86_emulate(
         break;
     }
 
-    case 0x20: /* mov cr,reg */
-    case 0x21: /* mov dr,reg */
-    case 0x22: /* mov reg,cr */
-    case 0x23: /* mov reg,dr */
+    case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
+    case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
+    case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
+    case X86EMUL_OPC(0x0f, 0x23): /* mov reg,dr */
         generate_exception_if(ea.type != OP_REG, EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         modrm_reg |= lock_prefix << 3;
@@ -4476,7 +4530,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x30: /* wrmsr */ {
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ {
         uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);
@@ -4485,7 +4539,7 @@ x86_emulate(
         break;
     }
 
-    case 0x31: rdtsc: /* rdtsc */ {
+    case X86EMUL_OPC(0x0f, 0x31): rdtsc: /* rdtsc */ {
         unsigned long cr4;
         uint64_t val;
         if ( !mode_ring0() )
@@ -4503,7 +4557,7 @@ x86_emulate(
         break;
     }
 
-    case 0x32: /* rdmsr */ {
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ {
         uint64_t val;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->read_msr == NULL);
@@ -4514,13 +4568,13 @@ x86_emulate(
         break;
     }
 
-    case 0x40 ... 0x4f: /* cmovcc */
+    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
         dst.val = src.val;
         if ( !test_cc(b, _regs.eflags) )
             dst.type = OP_NONE;
         break;
 
-    case 0x34: /* sysenter */ {
+    case X86EMUL_OPC(0x0f, 0x34): /* sysenter */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         int lm;
@@ -4568,7 +4622,7 @@ x86_emulate(
         break;
     }
 
-    case 0x35: /* sysexit */ {
+    case X86EMUL_OPC(0x0f, 0x35): /* sysexit */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         bool_t user64 = !!(rex_prefix & REX_W);
@@ -4607,18 +4661,26 @@ x86_emulate(
         break;
     }
 
-    case 0xe7: /* movntq mm,m64 */
-               /* {,v}movntdq xmm,m128 */
-               /* vmovntdq ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
+    case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
+                                         /* vmovntdq ymm,m256 */
         fail_if(ea.type != OP_MEM);
-        fail_if(vex.pfx == vex_f3);
         /* fall through */
-    case 0x6f: /* movq mm/m64,mm */
-               /* {,v}movdq{a,u} xmm/m128,xmm */
-               /* vmovdq{a,u} ymm/m256,ymm */
-    case 0x7f: /* movq mm,mm/m64 */
-               /* {,v}movdq{a,u} xmm,xmm/m128 */
-               /* vmovdq{a,u} ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
+    case X86EMUL_OPC_66(0x0f, 0x6f):     /* movdqa xmm/m128,xmm */
+    case X86EMUL_OPC_F3(0x0f, 0x6f):     /* movdqu xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
+                                         /* vmovdqa ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
+                                         /* vmovdqu ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+    case X86EMUL_OPC_66(0x0f, 0x7f):     /* movdqa xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
+                                         /* vmovdqa ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x7f):     /* movdqu xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
+                                         /* vmovdqu ymm,ymm/m256 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4654,8 +4716,7 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
-                    ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
+            fail_if(vex.reg != 0xf);
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4691,24 +4752,24 @@ x86_emulate(
         break;
     }
 
-    case 0x80 ... 0x8f: /* jcc (near) */
+    case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
         break;
 
-    case 0x90 ... 0x9f: /* setcc */
+    case X86EMUL_OPC(0x0f, 0x90) ... X86EMUL_OPC(0x0f, 0x9f): /* setcc */
         dst.val = test_cc(b, _regs.eflags);
         break;
 
-    case 0xa0: /* push %%fs */
+    case X86EMUL_OPC(0x0f, 0xa0): /* push %%fs */
         src.val = x86_seg_fs;
         goto push_seg;
 
-    case 0xa1: /* pop %%fs */
+    case X86EMUL_OPC(0x0f, 0xa1): /* pop %%fs */
         src.val = x86_seg_fs;
         goto pop_seg;
 
-    case 0xa2: /* cpuid */ {
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ {
         unsigned int eax = _regs.eax, ebx = _regs.ebx;
         unsigned int ecx = _regs.ecx, edx = _regs.edx;
         fail_if(ops->cpuid == NULL);
@@ -4719,15 +4780,15 @@ x86_emulate(
         break;
     }
 
-    case 0xa3: bt: /* bt */
+    case X86EMUL_OPC(0x0f, 0xa3): bt: /* bt */
         emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
 
-    case 0xa4: /* shld imm8,r,r/m */
-    case 0xa5: /* shld %%cl,r,r/m */
-    case 0xac: /* shrd imm8,r,r/m */
-    case 0xad: /* shrd %%cl,r,r/m */ {
+    case X86EMUL_OPC(0x0f, 0xa4): /* shld imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xa5): /* shld %%cl,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xac): /* shrd imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xad): /* shrd %%cl,r,r/m */ {
         uint8_t shift, width = dst.bytes << 3;
 
         generate_exception_if(lock_prefix, EXC_UD, -1);
@@ -4762,24 +4823,23 @@ x86_emulate(
         break;
     }
 
-    case 0xa8: /* push %%gs */
+    case X86EMUL_OPC(0x0f, 0xa8): /* push %%gs */
         src.val = x86_seg_gs;
         goto push_seg;
 
-    case 0xa9: /* pop %%gs */
+    case X86EMUL_OPC(0x0f, 0xa9): /* pop %%gs */
         src.val = x86_seg_gs;
         goto pop_seg;
 
-    case 0xab: bts: /* bts */
+    case X86EMUL_OPC(0x0f, 0xab): bts: /* bts */
         emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
         break;
 
-    case 0xae: /* Grp15 */
+    case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
         case 7: /* clflush{,opt} */
             fail_if(modrm_mod == 3);
-            fail_if(rep_prefix());
             fail_if(ops->wbinvd == NULL);
             if ( (rc = ops->wbinvd(ctxt)) != 0 )
                 goto done;
@@ -4789,11 +4849,11 @@ x86_emulate(
         }
         break;
 
-    case 0xaf: /* imul */
+    case X86EMUL_OPC(0x0f, 0xaf): /* imul */
         emulate_2op_SrcV_srcmem("imul", src, dst, _regs.eflags);
         break;
 
-    case 0xb0 ... 0xb1: /* cmpxchg */
+    case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.eax;
@@ -4812,34 +4872,34 @@ x86_emulate(
         }
         break;
 
-    case 0xb2: /* lss */
+    case X86EMUL_OPC(0x0f, 0xb2): /* lss */
         dst.val = x86_seg_ss;
         goto les;
 
-    case 0xb3: btr: /* btr */
+    case X86EMUL_OPC(0x0f, 0xb3): btr: /* btr */
         emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
         break;
 
-    case 0xb4: /* lfs */
+    case X86EMUL_OPC(0x0f, 0xb4): /* lfs */
         dst.val = x86_seg_fs;
         goto les;
 
-    case 0xb5: /* lgs */
+    case X86EMUL_OPC(0x0f, 0xb5): /* lgs */
         dst.val = x86_seg_gs;
         goto les;
 
-    case 0xb6: /* movzx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb6): /* movzx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (uint8_t)src.val;
         break;
 
-    case 0xb7: /* movzx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb7): /* movzx rm16,r{16,32,64} */
         dst.val = (uint16_t)src.val;
         break;
 
-    case 0xba: /* Grp8 */
+    case X86EMUL_OPC(0x0f, 0xba): /* Grp8 */
         switch ( modrm_reg & 7 )
         {
         case 4: goto bt;
@@ -4850,11 +4910,11 @@ x86_emulate(
         }
         break;
 
-    case 0xbb: btc: /* btc */
+    case X86EMUL_OPC(0x0f, 0xbb): btc: /* btc */
         emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
         break;
 
-    case 0xbc: /* bsf or tzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbc): /* bsf or tzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4886,7 +4946,7 @@ x86_emulate(
         break;
     }
 
-    case 0xbd: /* bsr or lzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbd): /* bsr or lzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4922,18 +4982,18 @@ x86_emulate(
         break;
     }
 
-    case 0xbe: /* movsx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbe): /* movsx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (int8_t)src.val;
         break;
 
-    case 0xbf: /* movsx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbf): /* movsx rm16,r{16,32,64} */
         dst.val = (int16_t)src.val;
         break;
 
-    case 0xc0 ... 0xc1: /* xadd */
+    case X86EMUL_OPC(0x0f, 0xc0): case X86EMUL_OPC(0x0f, 0xc1): /* xadd */
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -4944,14 +5004,14 @@ x86_emulate(
         }
         goto add;
 
-    case 0xc3: /* movnti */
+    case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have_sse2();
         generate_exception_if(dst.bytes <= 2, EXC_UD, -1);
         dst.val = src.val;
         break;
 
-    case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
+    case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
         unsigned long old[2], exp[2], new[2];
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD, -1);
@@ -4995,7 +5055,7 @@ x86_emulate(
         break;
     }
 
-    case 0xc8 ... 0xcf: /* bswap */
+    case X86EMUL_OPC(0x0f, 0xc8) ... X86EMUL_OPC(0x0f, 0xcf): /* bswap */
         dst.type = OP_REG;
         dst.reg  = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
@@ -5016,72 +5076,57 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f38_insn:
-    switch ( b )
-    {
-    case 0xf0: case 0xf1: /* movbe / crc32 */
-        generate_exception_if(repe_prefix(), EXC_UD, -1);
-        if ( repne_prefix() )
+    case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
+    case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
+        vcpu_must_have_movbe();
+        switch ( op_bytes )
         {
-            /* crc32 */
-#ifdef HAVE_GAS_SSE4_2
-            host_and_vcpu_must_have(sse4_2);
-            dst.bytes = rex_prefix & REX_W ? 8 : 4;
-            switch ( op_bytes )
-            {
-            case 1:
-                asm ( "crc32b %1,%k0" : "+r" (dst.val)
-                                      : "qm" (*(uint8_t *)&src.val) );
-                break;
-            case 2:
-                asm ( "crc32w %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint16_t *)&src.val) );
-                break;
-            case 4:
-                asm ( "crc32l %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint32_t *)&src.val) );
-                break;
-# ifdef __x86_64__
-            case 8:
-                asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
-                break;
-# endif
-            default:
-                ASSERT_UNREACHABLE();
-            }
-#else /* !HAVE_GAS_SSE4_2 */
-            goto cannot_emulate;
+        case 2:
+            asm ( "xchg %h0,%b0" : "=Q" (dst.val)
+                                 : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 4:
+#ifdef __x86_64__
+            asm ( "bswap %k0" : "=r" (dst.val)
+                              : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 8:
 #endif
+            asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
+            break;
+        default:
+            ASSERT_UNREACHABLE();
         }
-        else
+        break;
+#ifdef HAVE_GAS_SSE4_2
+    case X86EMUL_OPC_F2(0x0f38, 0xf0): /* crc32 r/m8, r{32,64} */
+    case X86EMUL_OPC_F2(0x0f38, 0xf1): /* crc32 r/m{16,32,64}, r{32,64} */
+        host_and_vcpu_must_have(sse4_2);
+        dst.bytes = rex_prefix & REX_W ? 8 : 4;
+        switch ( op_bytes )
         {
-            /* movbe */
-            vcpu_must_have_movbe();
-            switch ( op_bytes )
-            {
-            case 2:
-                asm ( "xchg %h0,%b0" : "=Q" (dst.val)
-                                     : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 4:
-#ifdef __x86_64__
-                asm ( "bswap %k0" : "=r" (dst.val)
-                                  : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 8:
-#endif
-                asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
-                break;
-            default:
-                ASSERT_UNREACHABLE();
-            }
+        case 1:
+            asm ( "crc32b %1,%k0" : "+r" (dst.val)
+                                  : "qm" (*(uint8_t *)&src.val) );
+            break;
+        case 2:
+            asm ( "crc32w %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint16_t *)&src.val) );
+            break;
+        case 4:
+            asm ( "crc32l %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint32_t *)&src.val) );
+            break;
+# ifdef __x86_64__
+        case 8:
+            asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
+            break;
+# endif
+        default:
+            ASSERT_UNREACHABLE();
         }
         break;
+#endif
     default:
         goto cannot_emulate;
     }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -415,12 +415,15 @@ struct x86_emulate_ctxt
     /* Stack pointer width in bits (16, 32 or 64). */
     unsigned int sp_size;
 
-    /* Set this if writes may have side effects. */
-    uint8_t force_writeback;
+    /* Canonical opcode (see below). */
+    unsigned int opcode;
 
     /* Software event injection support. */
     enum x86_swint_emulation swint_emulate;
 
+    /* Set this if writes may have side effects. */
+    uint8_t force_writeback;
+
     /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */
     union {
         struct {
@@ -435,6 +438,60 @@ struct x86_emulate_ctxt
     void *data;
 };
 
+/*
+ * Encode opcode extensions in the following way:
+ *     0x0xxxx for one byte opcodes
+ *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
+ *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
+ *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
+ * The low byte represents the base opcode within the respective space,
+ * and some of bits 8..15 are used for encoding further information (see
+ * below).
+ * Hence no separate #define-s get added.
+ */
+#define X86EMUL_OPC_EXT_MASK         0xffff0000
+#define X86EMUL_OPC(ext, byte)       ((uint8_t)(byte) | \
+                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))
+/*
+ * This includes the 66, F3, and F2 prefixes (see also below)
+ * as well as VEX/EVEX:
+ */
+#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
+                                     X86EMUL_OPC_ENCODING_MASK)
+
+/*
+ * Note that prefixes 66, F2, and F3 get encoded only when semantically
+ * meaningful, to reduce the complexity of interpreting this representation.
+ */
+#define X86EMUL_OPC_PFX_MASK         0x00000300
+# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
+# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
+# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
+
+#define X86EMUL_OPC_ENCODING_MASK    0x00003000
+#define X86EMUL_OPC_LEGACY_          0x00000000
+#define X86EMUL_OPC_VEX_             0x00001000
+# define X86EMUL_OPC_VEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
+#define X86EMUL_OPC_EVEX_            0x00002000
+# define X86EMUL_OPC_EVEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
+
 struct x86_emulate_stub {
     union {
         void (*func)(void);



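As an illustration of how the encoding above composes, here is a small standalone sketch; the macro definitions are a subset copied from the hunks above (including the MASK_INSR() helper the patch adds to the test harness), and the mnemonics in the comments follow the case labels used in the execution switch of this patch:

#include <stdint.h>
#include <stdio.h>

#define MASK_INSR(v, m)        (((v) * ((m) & -(m))) & (m))

#define X86EMUL_OPC_EXT_MASK   0xffff0000
#define X86EMUL_OPC(ext, byte) ((uint8_t)(byte) | \
                                MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))
#define X86EMUL_OPC_66(ext, byte) (X86EMUL_OPC(ext, byte) | 0x00000100)
#define X86EMUL_OPC_VEX_       0x00001000
#define X86EMUL_OPC_VEX_66(ext, byte) \
    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)

int main(void)
{
    /* movdqa xmm/m128,xmm: 66-prefixed 0f 6f -> 0x000f016f */
    printf("%#x\n", X86EMUL_OPC_66(0x0f, 0x6f));
    /* vmovdqa xmm/m128,xmm: VEX-encoded 66 0f 6f -> 0x000f116f */
    printf("%#x\n", X86EMUL_OPC_VEX_66(0x0f, 0x6f));
    return 0;
}

The 66 prefix and the VEX encoding land in the separate PFX and ENCODING bit fields, while the opcode extension space (0f here) occupies the high 16 bits.
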
[-- Attachment #2: x86emul-opcode-canon.patch --]
[-- Type: text/plain, Size: 34133 bytes --]

x86emul: generate and make use of a canonical opcode representation

This representation is then being made available to interested callers,
to facilitate replacing their custom decoding.

This entails combining the three main switch statements into one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend comments. Use uint8_t cast in X86EMUL_OPC(). Rename
    X86EMUL_OPC_KIND_MASK to X86EMUL_OPC_ENCODING_MASK. Add
    X86EMUL_OPC_LEGACY_.

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -14,6 +14,9 @@ typedef bool bool_t;
 #define ASSERT assert
 #define ASSERT_UNREACHABLE() assert(!__LINE__)
 
+#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
+#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))
+
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1626,7 +1626,6 @@ struct x86_emulate_state {
         ext_8f09,
         ext_8f0a,
     } ext;
-    uint8_t opcode;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t rex_prefix;
     bool lock_prefix;
@@ -1672,7 +1671,7 @@ x86_decode_onebyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode )
     {
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
@@ -1711,11 +1710,9 @@ x86_decode_twobyte(
 {
     int rc = X86EMUL_OKAY;
 
-    switch ( state->opcode )
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
     {
     case 0x78:
-        if ( vex.opcx )
-            break;
         switch ( vex.pfx )
         {
         case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -1724,7 +1721,23 @@ x86_decode_twobyte(
             imm2 = insn_fetch_type(uint8_t);
             break;
         }
-        break;
+        /* fall through */
+    case 0x10 ... 0x18:
+    case 0x28 ... 0x2f:
+    case 0x50 ... 0x77:
+    case 0x79 ... 0x7f:
+    case 0xae:
+    case 0xc2:
+    case 0xc4 ... 0xc7:
+    case 0xd0 ... 0xfe:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+        /* Intentionally not handling here despite being modified by F3:
+    case 0xb8: jmpe / popcnt
+    case 0xbc: bsf / tzcnt
+    case 0xbd: bsr / lzcnt
+         * They're being dealt with in the execution phase (if at all).
+         */
     }
 
  done:
@@ -1732,13 +1745,35 @@ x86_decode_twobyte(
 }
 
 static int
+x86_decode_0f38(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case 0x00 ... 0xef:
+    case 0xf2 ... 0xff:
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
+    case 0xf0: case 0xf1: /* movbe / crc32 */
+        if ( rep_prefix() )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
     uint8_t b, d, sib, sib_index, sib_base;
-    unsigned int def_op_bytes, def_ad_bytes;
+    unsigned int def_op_bytes, def_ad_bytes, opcode;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
@@ -1819,29 +1854,31 @@ x86_decode(
 
     /* Opcode byte(s). */
     d = opcode_table[b];
-    if ( d == 0 )
+    if ( d == 0 && b == 0x0f)
     {
-        /* Two-byte opcode? */
-        if ( b == 0x0f )
+        /* Two-byte opcode. */
+        b = insn_fetch_type(uint8_t);
+        d = twobyte_table[b];
+        switch ( b )
         {
+        default:
+            opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f;
+            break;
+        case 0x38:
             b = insn_fetch_type(uint8_t);
-            d = twobyte_table[b];
-            switch ( b )
-            {
-            default:
-                ext = ext_0f;
-                break;
-            case 0x38:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f38;
-                break;
-            case 0x3a:
-                b = insn_fetch_type(uint8_t);
-                ext = ext_0f3a;
-                break;
-            }
+            opcode = b | MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f38;
+            break;
+        case 0x3a:
+            b = insn_fetch_type(uint8_t);
+            opcode = b | MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
+            ext = ext_0f3a;
+            break;
         }
     }
+    else
+        opcode = b;
 
     /* ModRM and SIB bytes. */
     if ( d & ModRM )
@@ -1870,6 +1907,7 @@ x86_decode(
                 vex.raw[0] = modrm;
                 if ( b == 0xc5 )
                 {
+                    opcode = X86EMUL_OPC_VEX_;
                     vex.raw[1] = modrm;
                     vex.opcx = vex_0f;
                     vex.x = 1;
@@ -1891,31 +1929,44 @@ x86_decode(
                             op_bytes = 8;
                         }
                     }
-                    if ( b == 0x62 )
+                    switch ( b )
                     {
+                    case 0x62:
+                        opcode = X86EMUL_OPC_EVEX_;
                         evex.raw[0] = vex.raw[0];
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
                         vex.opcx = evex.opcx;
+                        break;
+                    case 0xc4:
+                        opcode = X86EMUL_OPC_VEX_;
+                        break;
+                    default:
+                        opcode = 0;
+                        break;
                     }
                 }
                 if ( mode_64bit() && !vex.r )
                     rex_prefix |= REX_R;
 
                 b = insn_fetch_type(uint8_t);
+                opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
                 ext = vex.opcx;
                 if ( b != 0x8f )
                 {
                     switch ( ext )
                     {
                     case vex_0f:
+                        opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[b];
                         break;
                     case vex_0f38:
+                        opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x38];
                         break;
                     case vex_0f3a:
+                        opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x3a];
                         break;
                     default:
@@ -1925,7 +1976,11 @@ x86_decode(
                 }
                 else if ( ext < ext_8f08 +
                                 sizeof(xop_table) / sizeof(*xop_table) )
+                {
+                    opcode |= MASK_INSR(0x8f08 + ext - ext_8f08,
+                                        X86EMUL_OPC_EXT_MASK);
                     d = xop_table[ext - ext_8f08];
+                }
                 else
                 {
                     rc = X86EMUL_UNHANDLEABLE;
@@ -1995,9 +2050,7 @@ x86_decode(
             break;
 
         case ext_0f38:
-            if ( vex.opcx )
-                break;
-            switch ( b )
+            switch ( opcode & X86EMUL_OPC_MASK )
             {
             case 0xf0: /* movbe / crc32 */
                 d |= repne_prefix() ? ByteOp : Mov;
@@ -2006,8 +2059,6 @@ x86_decode(
                 if ( !repne_prefix() )
                     d = (d & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
                 break;
-            default: /* Until it is worth making this table based ... */
-                return X86EMUL_UNHANDLEABLE;
             }
             break;
 
@@ -2166,7 +2217,7 @@ x86_decode(
         break;
     }
 
-    state->opcode = b;
+    ctxt->opcode = opcode;
     state->desc = d;
 
     switch ( ext )
@@ -2180,7 +2231,14 @@ x86_decode(
         break;
 
     case ext_0f38:
+        rc = x86_decode_0f38(state, ctxt, ops);
+        break;
+
     case ext_0f3a:
+        if ( !vex.opcx )
+            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     case ext_8f08:
     case ext_8f09:
     case ext_8f0a:
@@ -2222,7 +2280,7 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
-    b = state.opcode;
+    b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
 
@@ -2389,24 +2447,7 @@ x86_emulate(
         break;
     }
 
-    switch ( ext )
-    {
-    case ext_none:
-        break;
-    case ext_0f:
-        goto ext_0f_insn;
-    case ext_0f38:
-        goto ext_0f38_insn;
-    default:
-        ASSERT_UNREACHABLE();
-    case ext_0f3a:
-    case ext_8f08:
-    case ext_8f09:
-    case ext_8f0a:
-        goto cannot_emulate;
-    }
-
-    switch ( b )
+    switch ( ctxt->opcode )
     {
         struct segment_register cs;
 
@@ -4108,15 +4149,7 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f_insn:
-    switch ( b )
-    {
-    case 0x00: /* Grp6 */
+    case X86EMUL_OPC(0x0f, 0x00): /* Grp6 */
         fail_if((modrm_reg & 6) != 2);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4125,7 +4158,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x01: /* Grp7 */ {
+    case X86EMUL_OPC(0x0f, 0x01): /* Grp7 */ {
         struct segment_register reg;
         unsigned long base, limit, cr0, cr0w;
 
@@ -4270,7 +4303,7 @@ x86_emulate(
         break;
     }
 
-    case 0x05: /* syscall */ {
+    case X86EMUL_OPC(0x0f, 0x05): /* syscall */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
 
@@ -4330,7 +4363,7 @@ x86_emulate(
         break;
     }
 
-    case 0x06: /* clts */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
         if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ||
@@ -4338,42 +4371,64 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x08: /* invd */
-    case 0x09: /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x08): /* invd */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->wbinvd == NULL);
         if ( (rc = ops->wbinvd(ctxt)) != 0 )
             goto done;
         break;
 
-    case 0x0b: /* ud2 */
-    case 0xb9: /* ud1 */
-    case 0xff: /* ud0 */
+    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
+    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
+    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
         generate_exception_if(1, EXC_UD, -1);
 
-    case 0x0d: /* GrpP (prefetch) */
-    case 0x18: /* Grp16 (prefetch/nop) */
-    case 0x19 ... 0x1f: /* nop (amd-defined) */
+    case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */
+    case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
+    case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case 0x2b: /* {,v}movntp{s,d} xmm,m128 */
-               /* vmovntp{s,d} ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
+                                         /* vmovntps ymm,m256 */
+    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
+                                         /* vmovntpd ymm,m256 */
         fail_if(ea.type != OP_MEM);
         /* fall through */
-    case 0x28: /* {,v}movap{s,d} xmm/m128,xmm */
-               /* vmovap{s,d} ymm/m256,ymm */
-    case 0x29: /* {,v}movap{s,d} xmm,xmm/m128 */
-               /* vmovap{s,d} ymm,ymm/m256 */
-        fail_if(vex.pfx & VEX_PREFIX_SCALAR_MASK);
-        /* fall through */
-    case 0x10: /* {,v}movup{s,d} xmm/m128,xmm */
-               /* vmovup{s,d} ymm/m256,ymm */
-               /* {,v}movss xmm/m32,xmm */
-               /* {,v}movsd xmm/m64,xmm */
-    case 0x11: /* {,v}movup{s,d} xmm,xmm/m128 */
-               /* vmovup{s,d} ymm,ymm/m256 */
-               /* {,v}movss xmm,xmm/m32 */
-               /* {,v}movsd xmm,xmm/m64 */
+    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
+                                         /* vmovaps ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
+                                         /* vmovapd ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
+                                         /* vmovaps ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
+                                         /* vmovapd ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
+    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
+                                         /* vmovups ymm/m256,ymm */
+    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
+                                         /* vmovupd ymm/m256,ymm */
+    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
+    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
+                                         /* vmovups ymm,ymm/m256 */
+    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
+                                         /* vmovupd ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
+    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4396,10 +4451,9 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) ||
-                    ((vex.reg != 0xf) &&
-                     ((ea.type == OP_MEM) ||
-                      !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+            fail_if((vex.reg != 0xf) &&
+                    ((ea.type == OP_MEM) ||
+                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4437,10 +4491,10 @@ x86_emulate(
         break;
     }
 
-    case 0x20: /* mov cr,reg */
-    case 0x21: /* mov dr,reg */
-    case 0x22: /* mov reg,cr */
-    case 0x23: /* mov reg,dr */
+    case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
+    case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
+    case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
+    case X86EMUL_OPC(0x0f, 0x23): /* mov reg,dr */
         generate_exception_if(ea.type != OP_REG, EXC_UD, -1);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         modrm_reg |= lock_prefix << 3;
@@ -4476,7 +4530,7 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x30: /* wrmsr */ {
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ {
         uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);
@@ -4485,7 +4539,7 @@ x86_emulate(
         break;
     }
 
-    case 0x31: rdtsc: /* rdtsc */ {
+    case X86EMUL_OPC(0x0f, 0x31): rdtsc: /* rdtsc */ {
         unsigned long cr4;
         uint64_t val;
         if ( !mode_ring0() )
@@ -4503,7 +4557,7 @@ x86_emulate(
         break;
     }
 
-    case 0x32: /* rdmsr */ {
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ {
         uint64_t val;
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->read_msr == NULL);
@@ -4514,13 +4568,13 @@ x86_emulate(
         break;
     }
 
-    case 0x40 ... 0x4f: /* cmovcc */
+    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
         dst.val = src.val;
         if ( !test_cc(b, _regs.eflags) )
             dst.type = OP_NONE;
         break;
 
-    case 0x34: /* sysenter */ {
+    case X86EMUL_OPC(0x0f, 0x34): /* sysenter */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         int lm;
@@ -4568,7 +4622,7 @@ x86_emulate(
         break;
     }
 
-    case 0x35: /* sysexit */ {
+    case X86EMUL_OPC(0x0f, 0x35): /* sysexit */ {
         uint64_t msr_content;
         struct segment_register cs, ss;
         bool_t user64 = !!(rex_prefix & REX_W);
@@ -4607,18 +4661,26 @@ x86_emulate(
         break;
     }
 
-    case 0xe7: /* movntq mm,m64 */
-               /* {,v}movntdq xmm,m128 */
-               /* vmovntdq ymm,m256 */
+    case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
+    case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
+                                         /* vmovntdq ymm,m256 */
         fail_if(ea.type != OP_MEM);
-        fail_if(vex.pfx == vex_f3);
         /* fall through */
-    case 0x6f: /* movq mm/m64,mm */
-               /* {,v}movdq{a,u} xmm/m128,xmm */
-               /* vmovdq{a,u} ymm/m256,ymm */
-    case 0x7f: /* movq mm,mm/m64 */
-               /* {,v}movdq{a,u} xmm,xmm/m128 */
-               /* vmovdq{a,u} ymm,ymm/m256 */
+    case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
+    case X86EMUL_OPC_66(0x0f, 0x6f):     /* movdqa xmm/m128,xmm */
+    case X86EMUL_OPC_F3(0x0f, 0x6f):     /* movdqu xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
+                                         /* vmovdqa ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
+                                         /* vmovdqu ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+    case X86EMUL_OPC_66(0x0f, 0x7f):     /* movdqa xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
+                                         /* vmovdqa ymm,ymm/m256 */
+    case X86EMUL_OPC_F3(0x0f, 0x7f):     /* movdqu xmm,xmm/m128 */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
+                                         /* vmovdqu ymm,ymm/m256 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4654,8 +4716,7 @@ x86_emulate(
         }
         else
         {
-            fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
-                    ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
+            fail_if(vex.reg != 0xf);
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
@@ -4691,24 +4752,24 @@ x86_emulate(
         break;
     }
 
-    case 0x80 ... 0x8f: /* jcc (near) */
+    case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
         break;
 
-    case 0x90 ... 0x9f: /* setcc */
+    case X86EMUL_OPC(0x0f, 0x90) ... X86EMUL_OPC(0x0f, 0x9f): /* setcc */
         dst.val = test_cc(b, _regs.eflags);
         break;
 
-    case 0xa0: /* push %%fs */
+    case X86EMUL_OPC(0x0f, 0xa0): /* push %%fs */
         src.val = x86_seg_fs;
         goto push_seg;
 
-    case 0xa1: /* pop %%fs */
+    case X86EMUL_OPC(0x0f, 0xa1): /* pop %%fs */
         src.val = x86_seg_fs;
         goto pop_seg;
 
-    case 0xa2: /* cpuid */ {
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ {
         unsigned int eax = _regs.eax, ebx = _regs.ebx;
         unsigned int ecx = _regs.ecx, edx = _regs.edx;
         fail_if(ops->cpuid == NULL);
@@ -4719,15 +4780,15 @@ x86_emulate(
         break;
     }
 
-    case 0xa3: bt: /* bt */
+    case X86EMUL_OPC(0x0f, 0xa3): bt: /* bt */
         emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
 
-    case 0xa4: /* shld imm8,r,r/m */
-    case 0xa5: /* shld %%cl,r,r/m */
-    case 0xac: /* shrd imm8,r,r/m */
-    case 0xad: /* shrd %%cl,r,r/m */ {
+    case X86EMUL_OPC(0x0f, 0xa4): /* shld imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xa5): /* shld %%cl,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xac): /* shrd imm8,r,r/m */
+    case X86EMUL_OPC(0x0f, 0xad): /* shrd %%cl,r,r/m */ {
         uint8_t shift, width = dst.bytes << 3;
 
         generate_exception_if(lock_prefix, EXC_UD, -1);
@@ -4762,24 +4823,23 @@ x86_emulate(
         break;
     }
 
-    case 0xa8: /* push %%gs */
+    case X86EMUL_OPC(0x0f, 0xa8): /* push %%gs */
         src.val = x86_seg_gs;
         goto push_seg;
 
-    case 0xa9: /* pop %%gs */
+    case X86EMUL_OPC(0x0f, 0xa9): /* pop %%gs */
         src.val = x86_seg_gs;
         goto pop_seg;
 
-    case 0xab: bts: /* bts */
+    case X86EMUL_OPC(0x0f, 0xab): bts: /* bts */
         emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
         break;
 
-    case 0xae: /* Grp15 */
+    case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
         case 7: /* clflush{,opt} */
             fail_if(modrm_mod == 3);
-            fail_if(rep_prefix());
             fail_if(ops->wbinvd == NULL);
             if ( (rc = ops->wbinvd(ctxt)) != 0 )
                 goto done;
@@ -4789,11 +4849,11 @@ x86_emulate(
         }
         break;
 
-    case 0xaf: /* imul */
+    case X86EMUL_OPC(0x0f, 0xaf): /* imul */
         emulate_2op_SrcV_srcmem("imul", src, dst, _regs.eflags);
         break;
 
-    case 0xb0 ... 0xb1: /* cmpxchg */
+    case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.eax;
@@ -4812,34 +4872,34 @@ x86_emulate(
         }
         break;
 
-    case 0xb2: /* lss */
+    case X86EMUL_OPC(0x0f, 0xb2): /* lss */
         dst.val = x86_seg_ss;
         goto les;
 
-    case 0xb3: btr: /* btr */
+    case X86EMUL_OPC(0x0f, 0xb3): btr: /* btr */
         emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
         break;
 
-    case 0xb4: /* lfs */
+    case X86EMUL_OPC(0x0f, 0xb4): /* lfs */
         dst.val = x86_seg_fs;
         goto les;
 
-    case 0xb5: /* lgs */
+    case X86EMUL_OPC(0x0f, 0xb5): /* lgs */
         dst.val = x86_seg_gs;
         goto les;
 
-    case 0xb6: /* movzx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb6): /* movzx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (uint8_t)src.val;
         break;
 
-    case 0xb7: /* movzx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xb7): /* movzx rm16,r{16,32,64} */
         dst.val = (uint16_t)src.val;
         break;
 
-    case 0xba: /* Grp8 */
+    case X86EMUL_OPC(0x0f, 0xba): /* Grp8 */
         switch ( modrm_reg & 7 )
         {
         case 4: goto bt;
@@ -4850,11 +4910,11 @@ x86_emulate(
         }
         break;
 
-    case 0xbb: btc: /* btc */
+    case X86EMUL_OPC(0x0f, 0xbb): btc: /* btc */
         emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
         break;
 
-    case 0xbc: /* bsf or tzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbc): /* bsf or tzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4886,7 +4946,7 @@ x86_emulate(
         break;
     }
 
-    case 0xbd: /* bsr or lzcnt */ {
+    case X86EMUL_OPC(0x0f, 0xbd): /* bsr or lzcnt */ {
         bool_t zf;
 
 #ifdef __GCC_ASM_FLAG_OUTPUTS__
@@ -4922,18 +4982,18 @@ x86_emulate(
         break;
     }
 
-    case 0xbe: /* movsx rm8,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbe): /* movsx rm8,r{16,32,64} */
         /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */
         dst.reg   = decode_register(modrm_reg, &_regs, 0);
         dst.bytes = op_bytes;
         dst.val   = (int8_t)src.val;
         break;
 
-    case 0xbf: /* movsx rm16,r{16,32,64} */
+    case X86EMUL_OPC(0x0f, 0xbf): /* movsx rm16,r{16,32,64} */
         dst.val = (int16_t)src.val;
         break;
 
-    case 0xc0 ... 0xc1: /* xadd */
+    case X86EMUL_OPC(0x0f, 0xc0): case X86EMUL_OPC(0x0f, 0xc1): /* xadd */
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -4944,14 +5004,14 @@ x86_emulate(
         }
         goto add;
 
-    case 0xc3: /* movnti */
+    case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have_sse2();
         generate_exception_if(dst.bytes <= 2, EXC_UD, -1);
         dst.val = src.val;
         break;
 
-    case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
+    case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
         unsigned long old[2], exp[2], new[2];
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD, -1);
@@ -4995,7 +5055,7 @@ x86_emulate(
         break;
     }
 
-    case 0xc8 ... 0xcf: /* bswap */
+    case X86EMUL_OPC(0x0f, 0xc8) ... X86EMUL_OPC(0x0f, 0xcf): /* bswap */
         dst.type = OP_REG;
         dst.reg  = decode_register(
             (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
@@ -5016,72 +5076,57 @@ x86_emulate(
         }
         break;
 
-    default:
-        goto cannot_emulate;
-    }
-    goto writeback;
-
- ext_0f38_insn:
-    switch ( b )
-    {
-    case 0xf0: case 0xf1: /* movbe / crc32 */
-        generate_exception_if(repe_prefix(), EXC_UD, -1);
-        if ( repne_prefix() )
+    case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
+    case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
+        vcpu_must_have_movbe();
+        switch ( op_bytes )
         {
-            /* crc32 */
-#ifdef HAVE_GAS_SSE4_2
-            host_and_vcpu_must_have(sse4_2);
-            dst.bytes = rex_prefix & REX_W ? 8 : 4;
-            switch ( op_bytes )
-            {
-            case 1:
-                asm ( "crc32b %1,%k0" : "+r" (dst.val)
-                                      : "qm" (*(uint8_t *)&src.val) );
-                break;
-            case 2:
-                asm ( "crc32w %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint16_t *)&src.val) );
-                break;
-            case 4:
-                asm ( "crc32l %1,%k0" : "+r" (dst.val)
-                                      : "rm" (*(uint32_t *)&src.val) );
-                break;
-# ifdef __x86_64__
-            case 8:
-                asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
-                break;
-# endif
-            default:
-                ASSERT_UNREACHABLE();
-            }
-#else /* !HAVE_GAS_SSE4_2 */
-            goto cannot_emulate;
+        case 2:
+            asm ( "xchg %h0,%b0" : "=Q" (dst.val)
+                                 : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 4:
+#ifdef __x86_64__
+            asm ( "bswap %k0" : "=r" (dst.val)
+                              : "0" (*(uint32_t *)&src.val) );
+            break;
+        case 8:
 #endif
+            asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
+            break;
+        default:
+            ASSERT_UNREACHABLE();
         }
-        else
+        break;
+#ifdef HAVE_GAS_SSE4_2
+    case X86EMUL_OPC_F2(0x0f38, 0xf0): /* crc32 r/m8, r{32,64} */
+    case X86EMUL_OPC_F2(0x0f38, 0xf1): /* crc32 r/m{16,32,64}, r{32,64} */
+        host_and_vcpu_must_have(sse4_2);
+        dst.bytes = rex_prefix & REX_W ? 8 : 4;
+        switch ( op_bytes )
         {
-            /* movbe */
-            vcpu_must_have_movbe();
-            switch ( op_bytes )
-            {
-            case 2:
-                asm ( "xchg %h0,%b0" : "=Q" (dst.val)
-                                     : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 4:
-#ifdef __x86_64__
-                asm ( "bswap %k0" : "=r" (dst.val)
-                                  : "0" (*(uint32_t *)&src.val) );
-                break;
-            case 8:
-#endif
-                asm ( "bswap %0" : "=r" (dst.val) : "0" (src.val) );
-                break;
-            default:
-                ASSERT_UNREACHABLE();
-            }
+        case 1:
+            asm ( "crc32b %1,%k0" : "+r" (dst.val)
+                                  : "qm" (*(uint8_t *)&src.val) );
+            break;
+        case 2:
+            asm ( "crc32w %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint16_t *)&src.val) );
+            break;
+        case 4:
+            asm ( "crc32l %1,%k0" : "+r" (dst.val)
+                                  : "rm" (*(uint32_t *)&src.val) );
+            break;
+# ifdef __x86_64__
+        case 8:
+            asm ( "crc32q %1,%0" : "+r" (dst.val) : "rm" (src.val) );
+            break;
+# endif
+        default:
+            ASSERT_UNREACHABLE();
         }
         break;
+#endif
     default:
         goto cannot_emulate;
     }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -415,12 +415,15 @@ struct x86_emulate_ctxt
     /* Stack pointer width in bits (16, 32 or 64). */
     unsigned int sp_size;
 
-    /* Set this if writes may have side effects. */
-    uint8_t force_writeback;
+    /* Canonical opcode (see below). */
+    unsigned int opcode;
 
     /* Software event injection support. */
     enum x86_swint_emulation swint_emulate;
 
+    /* Set this if writes may have side effects. */
+    uint8_t force_writeback;
+
     /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */
     union {
         struct {
@@ -435,6 +438,60 @@ struct x86_emulate_ctxt
     void *data;
 };
 
+/*
+ * Encode opcode extensions in the following way:
+ *     0x0xxxx for one byte opcodes
+ *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
+ *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
+ *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
+ *  0x8f0axxxx for 8f/a-prefixed XOP opcodes
+ * The low byte represents the base opcode within the respective space,
+ * and some of bits 8..15 are used for encoding further information (see
+ * below).
+ * Hence no separate #define-s get added.
+ */
+#define X86EMUL_OPC_EXT_MASK         0xffff0000
+#define X86EMUL_OPC(ext, byte)       ((uint8_t)(byte) | \
+                                      MASK_INSR((ext), X86EMUL_OPC_EXT_MASK))
+/*
+ * This includes the 66, F3, and F2 prefixes (see also below)
+ * as well as VEX/EVEX:
+ */
+#define X86EMUL_OPC_MASK             (0x000000ff | X86EMUL_OPC_PFX_MASK | \
+                                     X86EMUL_OPC_ENCODING_MASK)
+
+/*
+ * Note that prefixes 66, F2, and F3 get encoded only when semantically
+ * meaningful, to reduce the complexity of interpreting this representation.
+ */
+#define X86EMUL_OPC_PFX_MASK         0x00000300
+# define X86EMUL_OPC_66(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000100)
+# define X86EMUL_OPC_F3(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000200)
+# define X86EMUL_OPC_F2(ext, byte)   (X86EMUL_OPC(ext, byte) | 0x00000300)
+
+#define X86EMUL_OPC_ENCODING_MASK    0x00003000
+#define X86EMUL_OPC_LEGACY_          0x00000000
+#define X86EMUL_OPC_VEX_             0x00001000
+# define X86EMUL_OPC_VEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_VEX_)
+# define X86EMUL_OPC_VEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_VEX_)
+#define X86EMUL_OPC_EVEX_            0x00002000
+# define X86EMUL_OPC_EVEX(ext, byte) \
+    (X86EMUL_OPC(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_66(ext, byte) \
+    (X86EMUL_OPC_66(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F3(ext, byte) \
+    (X86EMUL_OPC_F3(ext, byte) | X86EMUL_OPC_EVEX_)
+# define X86EMUL_OPC_EVEX_F2(ext, byte) \
+    (X86EMUL_OPC_F2(ext, byte) | X86EMUL_OPC_EVEX_)
+
 struct x86_emulate_stub {
     union {
         void (*func)(void);


* [PATCH v2 08/16] SVM: use generic instruction decoding
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (6 preceding siblings ...)
  2016-09-28  8:12 ` [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation Jan Beulich
@ 2016-09-28  8:13 ` Jan Beulich
  2016-09-29 19:24   ` Andrew Cooper
  2016-09-28  8:13 ` [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
                   ` (8 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Boris Ostrovsky, Suravee Suthikulpanit

[-- Attachment #1: Type: text/plain, Size: 18411 bytes --]

... instead of custom handling. To facilitate this, break out init code
from _hvm_emulate_one() into the new hvm_emulate_init(), and make
hvmemul_insn_fetch() globally available.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add comment to caller field. Rename REG_POISON to PTR_POISON. Align
    opc_tab[] initializer lines.
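
For orientation before the diff, the decode-only flow that the reworked
__get_instruction_length_from_list() below relies on boils down to the
following condensed sketch (the wrapper function name is made up for
illustration; this is not buildable outside the hypervisor tree, and
error/debug handling is trimmed):

/* Sketch only: decode the instruction at the current guest RIP. */
static unsigned int guest_insn_length(void)
{
    struct hvm_emulate_ctxt ctxt;
    struct x86_emulate_state *state;
    unsigned int len, modrm_rm, modrm_reg;

    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
    hvm_emulate_init(&ctxt, NULL, 0);             /* fetch from guest CS:rIP */

    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
    if ( IS_ERR_OR_NULL(state) )
        return 0;

    len = x86_insn_length(state, &ctxt.ctxt);     /* bytes consumed by decode */
    x86_insn_modrm(state, &modrm_rm, &modrm_reg); /* -EINVAL if no ModRM byte */
    x86_emulate_free_state(state);                /* a no-op in NDEBUG builds */

    return len;
}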

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -835,7 +835,7 @@ static int hvmemul_read(
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
-static int hvmemul_insn_fetch(
+int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
     void *p_data,
@@ -1765,15 +1765,14 @@ static const struct x86_emulate_ops hvm_
     .vmfunc        = hvmemul_vmfunc,
 };
 
-static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
-    const struct x86_emulate_ops *ops)
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes)
 {
-    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
-    uint32_t new_intr_shadow, pfec = PFEC_page_present;
-    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    unsigned int pfec = PFEC_page_present;
     unsigned long addr;
-    int rc;
 
     if ( hvm_long_mode_enabled(curr) &&
          hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
@@ -1791,14 +1790,14 @@ static int _hvm_emulate_one(struct hvm_e
     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    hvmemul_ctxt->insn_buf_eip = regs->eip;
-    if ( !vio->mmio_insn_bytes )
+    hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->eip;
+    if ( !insn_bytes )
     {
         hvmemul_ctxt->insn_buf_bytes =
             hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
             (hvm_virtual_to_linear_addr(x86_seg_cs,
                                         &hvmemul_ctxt->seg_reg[x86_seg_cs],
-                                        regs->eip,
+                                        hvmemul_ctxt->insn_buf_eip,
                                         sizeof(hvmemul_ctxt->insn_buf),
                                         hvm_access_insn_fetch,
                                         hvmemul_ctxt->ctxt.addr_size,
@@ -1810,11 +1809,24 @@ static int _hvm_emulate_one(struct hvm_e
     }
     else
     {
-        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
-        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
     }
 
     hvmemul_ctxt->exn_pending = 0;
+}
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t new_intr_shadow;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    int rc;
+
+    hvm_emulate_init(hvmemul_ctxt, vio->mmio_insn, vio->mmio_insn_bytes);
+
     vio->mmio_retry = 0;
 
     if ( cpu_has_vmx )
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -15,7 +15,7 @@
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <xen/config.h>
+#include <xen/err.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -26,41 +26,6 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 
-static unsigned int is_prefix(u8 opc)
-{
-    switch ( opc )
-    {
-    case 0x66:
-    case 0x67:
-    case 0x2E:
-    case 0x3E:
-    case 0x26:
-    case 0x64:
-    case 0x65:
-    case 0x36:
-    case 0xF0:
-    case 0xF3:
-    case 0xF2:
-    case 0x40 ... 0x4f:
-        return 1;
-    }
-    return 0;
-}
-
-static unsigned long svm_rip2pointer(struct vcpu *v, unsigned long *limit)
-{
-    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned long p = vmcb->cs.base + vmcb->rip;
-
-    if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) )
-    {
-        *limit = vmcb->cs.limit;
-        return (u32)p; /* mask to 32 bits */
-    }
-    *limit = ~0UL;
-    return p;
-}
-
 static unsigned long svm_nextrip_insn_length(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
     return vmcb->nextrip - vmcb->rip;
 }
 
-/* First byte: Length. Following bytes: Opcode bytes. */
-#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
-MAKE_INSTR(INVD,   2, 0x0f, 0x08);
-MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
-MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
-MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
-MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
-MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
-MAKE_INSTR(HLT,    1, 0xf4);
-MAKE_INSTR(INT3,   1, 0xcc);
-MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
-MAKE_INSTR(PAUSE,  1, 0x90);
-MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
-MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
-MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
-MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
-MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
-MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
-MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
-
-static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
-{
-    [INSTR_INVD]   = OPCODE_INVD,
-    [INSTR_WBINVD] = OPCODE_WBINVD,
-    [INSTR_CPUID]  = OPCODE_CPUID,
-    [INSTR_RDMSR]  = OPCODE_RDMSR,
-    [INSTR_WRMSR]  = OPCODE_WRMSR,
-    [INSTR_VMCALL] = OPCODE_VMCALL,
-    [INSTR_HLT]    = OPCODE_HLT,
-    [INSTR_INT3]   = OPCODE_INT3,
-    [INSTR_RDTSC]  = OPCODE_RDTSC,
-    [INSTR_PAUSE]  = OPCODE_PAUSE,
-    [INSTR_XSETBV] = OPCODE_XSETBV,
-    [INSTR_VMRUN]  = OPCODE_VMRUN,
-    [INSTR_VMLOAD] = OPCODE_VMLOAD,
-    [INSTR_VMSAVE] = OPCODE_VMSAVE,
-    [INSTR_STGI]   = OPCODE_STGI,
-    [INSTR_CLGI]   = OPCODE_CLGI,
-    [INSTR_INVLPGA] = OPCODE_INVLPGA,
+static const struct {
+    unsigned int opcode;
+    struct {
+        unsigned int rm:3;
+        unsigned int reg:3;
+        unsigned int mod:2;
+#define MODRM(mod, reg, rm) { rm, reg, mod }
+    } modrm;
+} const opc_tab[INSTR_MAX_COUNT] = {
+    [INSTR_PAUSE]   = { X86EMUL_OPC_F3(0, 0x90) },
+    [INSTR_INT3]    = { X86EMUL_OPC(   0, 0xcc) },
+    [INSTR_HLT]     = { X86EMUL_OPC(   0, 0xf4) },
+    [INSTR_XSETBV]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
+    [INSTR_VMRUN]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
+    [INSTR_VMCALL]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
+    [INSTR_VMLOAD]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
+    [INSTR_VMSAVE]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
+    [INSTR_STGI]    = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
+    [INSTR_CLGI]    = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
+    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
+    [INSTR_INVD]    = { X86EMUL_OPC(0x0f, 0x08) },
+    [INSTR_WBINVD]  = { X86EMUL_OPC(0x0f, 0x09) },
+    [INSTR_WRMSR]   = { X86EMUL_OPC(0x0f, 0x30) },
+    [INSTR_RDTSC]   = { X86EMUL_OPC(0x0f, 0x31) },
+    [INSTR_RDMSR]   = { X86EMUL_OPC(0x0f, 0x32) },
+    [INSTR_CPUID]   = { X86EMUL_OPC(0x0f, 0xa2) },
 };
 
-static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
-                    unsigned long addr, unsigned int len)
-{
-    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
-
-    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
-    {
-    case HVMCOPY_okay:
-        break;
-    case HVMCOPY_bad_gva_to_gfn:
-        /* OK just to give up; we'll have injected #PF already */
-        return 0;
-    default:
-        /* Not OK: fetches from non-RAM pages are not supportable. */
-        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
-                 vmcb->rip, addr);
-        hvm_inject_hw_exception(TRAP_gp_fault, 0);
-        return 0;
-    }
-    return 1;
-}
-
 int __get_instruction_length_from_list(struct vcpu *v,
         const enum instruction_index *list, unsigned int list_count)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int i, j, inst_len = 0;
-    enum instruction_index instr = 0;
-    u8 buf[MAX_INST_LEN];
-    const u8 *opcode = NULL;
-    unsigned long fetch_addr, fetch_limit;
-    unsigned int fetch_len, max_len;
+    struct hvm_emulate_ctxt ctxt;
+    struct x86_emulate_state *state;
+    unsigned int inst_len, j, modrm_rm, modrm_reg;
+    int modrm_mod;
 
+#ifdef NDEBUG
     if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
         return inst_len;
 
     if ( vmcb->exitcode == VMEXIT_IOIO )
         return vmcb->exitinfo2 - vmcb->rip;
+#endif
 
-    /* Fetch up to the next page break; we'll fetch from the next page
-     * later if we have to. */
-    fetch_addr = svm_rip2pointer(v, &fetch_limit);
-    if ( vmcb->rip > fetch_limit )
-        return 0;
-    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
-    fetch_len = min_t(unsigned int, max_len,
-                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
-    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
+    ASSERT(v == current);
+    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+    hvm_emulate_init(&ctxt, NULL, 0);
+    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
+    if ( IS_ERR_OR_NULL(state) )
         return 0;
 
-    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
-    {
-        inst_len++;
-        if ( inst_len >= fetch_len )
-        {
-            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                        max_len - fetch_len) )
-                return 0;
-            fetch_len = max_len;
-        }
+    inst_len = x86_insn_length(state, &ctxt.ctxt);
+    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
+    x86_emulate_free_state(state);
+#ifndef NDEBUG
+    if ( vmcb->exitcode == VMEXIT_IOIO )
+        j = vmcb->exitinfo2 - vmcb->rip;
+    else
+        j = svm_nextrip_insn_length(v);
+    if ( j && j != inst_len )
+    {
+        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
+                ctxt.ctxt.opcode, inst_len, j);
+        return j;
     }
+#endif
 
     for ( j = 0; j < list_count; j++ )
     {
-        instr = list[j];
-        opcode = opc_bytes[instr];
+        enum instruction_index instr = list[j];
 
-        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
+        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));
+        if ( opc_tab[instr].opcode == ctxt.ctxt.opcode )
         {
-            if ( (inst_len + i) >= fetch_len ) 
-            {
-                if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                            max_len - fetch_len) )
-                    return 0;
-                fetch_len = max_len;
-            }
+            if ( !opc_tab[instr].modrm.mod )
+                return inst_len;
 
-            if ( buf[inst_len+i] != opcode[i+1] )
-                goto mismatch;
+            if ( modrm_mod == opc_tab[instr].modrm.mod &&
+                 (modrm_rm & 7) == opc_tab[instr].modrm.rm &&
+                 (modrm_reg & 7) == opc_tab[instr].modrm.reg )
+                return inst_len;
         }
-        goto done;
-    mismatch: ;
     }
 
     gdprintk(XENLOG_WARNING,
-             "%s: Mismatch between expected and actual instruction bytes: "
+             "%s: Mismatch between expected and actual instruction: "
              "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
     hvm_inject_hw_exception(TRAP_gp_fault, 0);
     return 0;
-
- done:
-    inst_len += opcode[0];
-    ASSERT(inst_len <= max_len);
-    return inst_len;
 }
 
 /*
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -382,9 +382,9 @@ struct operand {
     } mem;
 };
 #ifdef __x86_64__
-#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
+#define PTR_POISON ((void *)0x8086000000008086UL) /* non-canonical */
 #else
-#define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
+#define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
 typedef union {
@@ -1646,6 +1646,14 @@ struct x86_emulate_state {
 
     unsigned long eip;
     struct cpu_user_regs *regs;
+
+#ifndef NDEBUG
+    /*
+     * Track caller of x86_decode_insn() to spot missing as well as
+     * premature calls to x86_emulate_free_state().
+     */
+    void *caller;
+#endif
 };
 
 /* Helper definitions. */
@@ -1673,6 +1681,11 @@ x86_decode_onebyte(
 
     switch ( ctxt->opcode )
     {
+    case 0x90: /* nop / pause */
+        if ( repe_prefix() )
+            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
+        break;
+
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
         generate_exception_if(mode_64bit(), EXC_UD, -1);
@@ -1780,7 +1793,7 @@ x86_decode(
     override_seg = -1;
     ea.type = OP_MEM;
     ea.mem.seg = x86_seg_ds;
-    ea.reg = REG_POISON;
+    ea.reg = PTR_POISON;
     state->regs = ctxt->regs;
     state->eip = ctxt->regs->eip;
 
@@ -2267,8 +2280,8 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
-    struct operand src = { .reg = REG_POISON };
-    struct operand dst = { .reg = REG_POISON };
+    struct operand src = { .reg = PTR_POISON };
+    struct operand dst = { .reg = PTR_POISON };
     enum x86_swint_type swint_type;
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
@@ -2861,8 +2874,9 @@ x86_emulate(
         break;
 
     case 0x90: /* nop / xchg %%r8,%%rax */
+    case X86EMUL_OPC_F3(0, 0x90): /* pause / xchg %%r8,%%rax */
         if ( !(rex_prefix & 1) )
-            break; /* nop */
+            break; /* nop / pause */
         /* fall through */
 
     case 0x91 ... 0x97: /* xchg reg,%%rax */
@@ -5200,3 +5214,89 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
+
+#ifdef __XEN__
+
+#include <xen/err.h>
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt))
+{
+    static DEFINE_PER_CPU(struct x86_emulate_state, state);
+    struct x86_emulate_state *state = &this_cpu(state);
+    const struct x86_emulate_ops ops = {
+        .insn_fetch = insn_fetch,
+        .read       = x86emul_unhandleable_rw,
+        .write      = PTR_POISON,
+        .cmpxchg    = PTR_POISON,
+    };
+    int rc = x86_decode(state, ctxt, &ops);
+
+    if ( unlikely(rc != X86EMUL_OKAY) )
+        return ERR_PTR(-rc);
+
+#ifndef NDEBUG
+    /*
+     * While we avoid memory allocation (by use of per-CPU data) above,
+     * nevertheless make sure callers properly release the state structure
+     * for forward compatibility.
+     */
+    if ( state->caller )
+    {
+        printk(XENLOG_ERR "Unreleased emulation state acquired by %ps\n",
+               state->caller);
+        dump_execution_state();
+    }
+    state->caller = __builtin_return_address(0);
+#endif
+
+    return state;
+}
+
+static inline void check_state(const struct x86_emulate_state *state)
+{
+#ifndef NDEBUG
+    ASSERT(state->caller);
+#endif
+}
+
+#ifndef NDEBUG
+void x86_emulate_free_state(struct x86_emulate_state *state)
+{
+    check_state(state);
+    state->caller = NULL;
+}
+#endif
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg)
+{
+    check_state(state);
+
+    if ( !(state->desc & ModRM) )
+        return -EINVAL;
+
+    if ( rm )
+        *rm = state->modrm_rm;
+    if ( reg )
+        *reg = state->modrm_reg;
+
+    return state->modrm_mod;
+}
+
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt)
+{
+    check_state(state);
+
+    return state->eip - ctxt->regs->eip;
+}
+
+#endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -532,4 +532,29 @@ x86emul_unhandleable_rw(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
 
+#ifdef __XEN__
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt));
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg);
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt);
+
+#ifdef NDEBUG
+static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+#else
+void x86_emulate_free_state(struct x86_emulate_state *state);
+#endif
+
+#endif
+
 #endif /* __X86_EMULATE_H__ */
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -53,6 +53,10 @@ void hvm_mem_access_emulate_one(enum emu
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes);
 void hvm_emulate_writeback(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 struct segment_register *hvmemul_get_seg_reg(
@@ -60,6 +64,11 @@ struct segment_register *hvmemul_get_seg
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
 
+int hvmemul_insn_fetch(enum x86_segment seg,
+                       unsigned long offset,
+                       void *p_data,
+                       unsigned int bytes,
+                       struct x86_emulate_ctxt *ctxt);
 int hvmemul_do_pio_buffer(uint16_t port,
                           unsigned int size,
                           uint8_t dir,



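As a cross-check on the opc_tab[] ModRM encoding introduced above: the third
byte of the old OPCODE_VMCALL sequence (0f 01 d9) decomposes into exactly the
MODRM(3, 3, 1) triple the new table stores. A trivial standalone sketch:

#include <stdio.h>

int main(void)
{
    unsigned int modrm = 0xd9;            /* third byte of the old OPCODE_VMCALL */
    unsigned int mod = modrm >> 6;        /* 3 */
    unsigned int reg = (modrm >> 3) & 7;  /* 3 */
    unsigned int rm  = modrm & 7;         /* 1 */

    /* Prints "mod=3 reg=3 rm=1", matching opc_tab[INSTR_VMCALL]'s MODRM(3, 3, 1). */
    printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);
    return 0;
}
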
[-- Attachment #2: SVM-use-generic-decode.patch --]
[-- Type: text/plain, Size: 18448 bytes --]

SVM: use generic instruction decoding

... instead of custom handling. To facilitate this, break out init code
from _hvm_emulate_one() into the new hvm_emulate_init(), and make
hvmemul_insn_fetch() globally available.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add comment to caller field. Rename REG_POISON to PTR_POISON. Align
    opc_tab[] initializer lines.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -835,7 +835,7 @@ static int hvmemul_read(
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
-static int hvmemul_insn_fetch(
+int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
     void *p_data,
@@ -1765,15 +1765,14 @@ static const struct x86_emulate_ops hvm_
     .vmfunc        = hvmemul_vmfunc,
 };
 
-static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
-    const struct x86_emulate_ops *ops)
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes)
 {
-    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
-    uint32_t new_intr_shadow, pfec = PFEC_page_present;
-    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    unsigned int pfec = PFEC_page_present;
     unsigned long addr;
-    int rc;
 
     if ( hvm_long_mode_enabled(curr) &&
          hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
@@ -1791,14 +1790,14 @@ static int _hvm_emulate_one(struct hvm_e
     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    hvmemul_ctxt->insn_buf_eip = regs->eip;
-    if ( !vio->mmio_insn_bytes )
+    hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->eip;
+    if ( !insn_bytes )
     {
         hvmemul_ctxt->insn_buf_bytes =
             hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
             (hvm_virtual_to_linear_addr(x86_seg_cs,
                                         &hvmemul_ctxt->seg_reg[x86_seg_cs],
-                                        regs->eip,
+                                        hvmemul_ctxt->insn_buf_eip,
                                         sizeof(hvmemul_ctxt->insn_buf),
                                         hvm_access_insn_fetch,
                                         hvmemul_ctxt->ctxt.addr_size,
@@ -1810,11 +1809,24 @@ static int _hvm_emulate_one(struct hvm_e
     }
     else
     {
-        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
-        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
     }
 
     hvmemul_ctxt->exn_pending = 0;
+}
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t new_intr_shadow;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    int rc;
+
+    hvm_emulate_init(hvmemul_ctxt, vio->mmio_insn, vio->mmio_insn_bytes);
+
     vio->mmio_retry = 0;
 
     if ( cpu_has_vmx )
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -15,7 +15,7 @@
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <xen/config.h>
+#include <xen/err.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -26,41 +26,6 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 
-static unsigned int is_prefix(u8 opc)
-{
-    switch ( opc )
-    {
-    case 0x66:
-    case 0x67:
-    case 0x2E:
-    case 0x3E:
-    case 0x26:
-    case 0x64:
-    case 0x65:
-    case 0x36:
-    case 0xF0:
-    case 0xF3:
-    case 0xF2:
-    case 0x40 ... 0x4f:
-        return 1;
-    }
-    return 0;
-}
-
-static unsigned long svm_rip2pointer(struct vcpu *v, unsigned long *limit)
-{
-    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned long p = vmcb->cs.base + vmcb->rip;
-
-    if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) )
-    {
-        *limit = vmcb->cs.limit;
-        return (u32)p; /* mask to 32 bits */
-    }
-    *limit = ~0UL;
-    return p;
-}
-
 static unsigned long svm_nextrip_insn_length(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
     return vmcb->nextrip - vmcb->rip;
 }
 
-/* First byte: Length. Following bytes: Opcode bytes. */
-#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
-MAKE_INSTR(INVD,   2, 0x0f, 0x08);
-MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
-MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
-MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
-MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
-MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
-MAKE_INSTR(HLT,    1, 0xf4);
-MAKE_INSTR(INT3,   1, 0xcc);
-MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
-MAKE_INSTR(PAUSE,  1, 0x90);
-MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
-MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
-MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
-MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
-MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
-MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
-MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
-
-static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
-{
-    [INSTR_INVD]   = OPCODE_INVD,
-    [INSTR_WBINVD] = OPCODE_WBINVD,
-    [INSTR_CPUID]  = OPCODE_CPUID,
-    [INSTR_RDMSR]  = OPCODE_RDMSR,
-    [INSTR_WRMSR]  = OPCODE_WRMSR,
-    [INSTR_VMCALL] = OPCODE_VMCALL,
-    [INSTR_HLT]    = OPCODE_HLT,
-    [INSTR_INT3]   = OPCODE_INT3,
-    [INSTR_RDTSC]  = OPCODE_RDTSC,
-    [INSTR_PAUSE]  = OPCODE_PAUSE,
-    [INSTR_XSETBV] = OPCODE_XSETBV,
-    [INSTR_VMRUN]  = OPCODE_VMRUN,
-    [INSTR_VMLOAD] = OPCODE_VMLOAD,
-    [INSTR_VMSAVE] = OPCODE_VMSAVE,
-    [INSTR_STGI]   = OPCODE_STGI,
-    [INSTR_CLGI]   = OPCODE_CLGI,
-    [INSTR_INVLPGA] = OPCODE_INVLPGA,
+static const struct {
+    unsigned int opcode;
+    struct {
+        unsigned int rm:3;
+        unsigned int reg:3;
+        unsigned int mod:2;
+#define MODRM(mod, reg, rm) { rm, reg, mod }
+    } modrm;
+} const opc_tab[INSTR_MAX_COUNT] = {
+    [INSTR_PAUSE]   = { X86EMUL_OPC_F3(0, 0x90) },
+    [INSTR_INT3]    = { X86EMUL_OPC(   0, 0xcc) },
+    [INSTR_HLT]     = { X86EMUL_OPC(   0, 0xf4) },
+    [INSTR_XSETBV]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
+    [INSTR_VMRUN]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
+    [INSTR_VMCALL]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
+    [INSTR_VMLOAD]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
+    [INSTR_VMSAVE]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
+    [INSTR_STGI]    = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
+    [INSTR_CLGI]    = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
+    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
+    [INSTR_INVD]    = { X86EMUL_OPC(0x0f, 0x08) },
+    [INSTR_WBINVD]  = { X86EMUL_OPC(0x0f, 0x09) },
+    [INSTR_WRMSR]   = { X86EMUL_OPC(0x0f, 0x30) },
+    [INSTR_RDTSC]   = { X86EMUL_OPC(0x0f, 0x31) },
+    [INSTR_RDMSR]   = { X86EMUL_OPC(0x0f, 0x32) },
+    [INSTR_CPUID]   = { X86EMUL_OPC(0x0f, 0xa2) },
 };
 
-static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
-                    unsigned long addr, unsigned int len)
-{
-    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
-
-    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
-    {
-    case HVMCOPY_okay:
-        break;
-    case HVMCOPY_bad_gva_to_gfn:
-        /* OK just to give up; we'll have injected #PF already */
-        return 0;
-    default:
-        /* Not OK: fetches from non-RAM pages are not supportable. */
-        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
-                 vmcb->rip, addr);
-        hvm_inject_hw_exception(TRAP_gp_fault, 0);
-        return 0;
-    }
-    return 1;
-}
-
 int __get_instruction_length_from_list(struct vcpu *v,
         const enum instruction_index *list, unsigned int list_count)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int i, j, inst_len = 0;
-    enum instruction_index instr = 0;
-    u8 buf[MAX_INST_LEN];
-    const u8 *opcode = NULL;
-    unsigned long fetch_addr, fetch_limit;
-    unsigned int fetch_len, max_len;
+    struct hvm_emulate_ctxt ctxt;
+    struct x86_emulate_state *state;
+    unsigned int inst_len, j, modrm_rm, modrm_reg;
+    int modrm_mod;
 
+#ifdef NDEBUG
     if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
         return inst_len;
 
     if ( vmcb->exitcode == VMEXIT_IOIO )
         return vmcb->exitinfo2 - vmcb->rip;
+#endif
 
-    /* Fetch up to the next page break; we'll fetch from the next page
-     * later if we have to. */
-    fetch_addr = svm_rip2pointer(v, &fetch_limit);
-    if ( vmcb->rip > fetch_limit )
-        return 0;
-    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
-    fetch_len = min_t(unsigned int, max_len,
-                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
-    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
+    ASSERT(v == current);
+    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+    hvm_emulate_init(&ctxt, NULL, 0);
+    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
+    if ( IS_ERR_OR_NULL(state) )
         return 0;
 
-    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
-    {
-        inst_len++;
-        if ( inst_len >= fetch_len )
-        {
-            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                        max_len - fetch_len) )
-                return 0;
-            fetch_len = max_len;
-        }
+    inst_len = x86_insn_length(state, &ctxt.ctxt);
+    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
+    x86_emulate_free_state(state);
+#ifndef NDEBUG
+    if ( vmcb->exitcode == VMEXIT_IOIO )
+        j = vmcb->exitinfo2 - vmcb->rip;
+    else
+        j = svm_nextrip_insn_length(v);
+    if ( j && j != inst_len )
+    {
+        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
+                ctxt.ctxt.opcode, inst_len, j);
+        return j;
     }
+#endif
 
     for ( j = 0; j < list_count; j++ )
     {
-        instr = list[j];
-        opcode = opc_bytes[instr];
+        enum instruction_index instr = list[j];
 
-        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
+        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));
+        if ( opc_tab[instr].opcode == ctxt.ctxt.opcode )
         {
-            if ( (inst_len + i) >= fetch_len ) 
-            {
-                if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                            max_len - fetch_len) )
-                    return 0;
-                fetch_len = max_len;
-            }
+            if ( !opc_tab[instr].modrm.mod )
+                return inst_len;
 
-            if ( buf[inst_len+i] != opcode[i+1] )
-                goto mismatch;
+            if ( modrm_mod == opc_tab[instr].modrm.mod &&
+                 (modrm_rm & 7) == opc_tab[instr].modrm.rm &&
+                 (modrm_reg & 7) == opc_tab[instr].modrm.reg )
+                return inst_len;
         }
-        goto done;
-    mismatch: ;
     }
 
     gdprintk(XENLOG_WARNING,
-             "%s: Mismatch between expected and actual instruction bytes: "
+             "%s: Mismatch between expected and actual instruction: "
              "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
     hvm_inject_hw_exception(TRAP_gp_fault, 0);
     return 0;
-
- done:
-    inst_len += opcode[0];
-    ASSERT(inst_len <= max_len);
-    return inst_len;
 }
 
 /*
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -382,9 +382,9 @@ struct operand {
     } mem;
 };
 #ifdef __x86_64__
-#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
+#define PTR_POISON ((void *)0x8086000000008086UL) /* non-canonical */
 #else
-#define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
+#define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
 typedef union {
@@ -1646,6 +1646,14 @@ struct x86_emulate_state {
 
     unsigned long eip;
     struct cpu_user_regs *regs;
+
+#ifndef NDEBUG
+    /*
+     * Track caller of x86_decode_insn() to spot missing as well as
+     * premature calls to x86_emulate_free_state().
+     */
+    void *caller;
+#endif
 };
 
 /* Helper definitions. */
@@ -1673,6 +1681,11 @@ x86_decode_onebyte(
 
     switch ( ctxt->opcode )
     {
+    case 0x90: /* nop / pause */
+        if ( repe_prefix() )
+            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
+        break;
+
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
         generate_exception_if(mode_64bit(), EXC_UD, -1);
@@ -1780,7 +1793,7 @@ x86_decode(
     override_seg = -1;
     ea.type = OP_MEM;
     ea.mem.seg = x86_seg_ds;
-    ea.reg = REG_POISON;
+    ea.reg = PTR_POISON;
     state->regs = ctxt->regs;
     state->eip = ctxt->regs->eip;
 
@@ -2267,8 +2280,8 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
-    struct operand src = { .reg = REG_POISON };
-    struct operand dst = { .reg = REG_POISON };
+    struct operand src = { .reg = PTR_POISON };
+    struct operand dst = { .reg = PTR_POISON };
     enum x86_swint_type swint_type;
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
@@ -2861,8 +2874,9 @@ x86_emulate(
         break;
 
     case 0x90: /* nop / xchg %%r8,%%rax */
+    case X86EMUL_OPC_F3(0, 0x90): /* pause / xchg %%r8,%%rax */
         if ( !(rex_prefix & 1) )
-            break; /* nop */
+            break; /* nop / pause */
         /* fall through */
 
     case 0x91 ... 0x97: /* xchg reg,%%rax */
@@ -5200,3 +5214,89 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
+
+#ifdef __XEN__
+
+#include <xen/err.h>
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt))
+{
+    static DEFINE_PER_CPU(struct x86_emulate_state, state);
+    struct x86_emulate_state *state = &this_cpu(state);
+    const struct x86_emulate_ops ops = {
+        .insn_fetch = insn_fetch,
+        .read       = x86emul_unhandleable_rw,
+        .write      = PTR_POISON,
+        .cmpxchg    = PTR_POISON,
+    };
+    int rc = x86_decode(state, ctxt, &ops);
+
+    if ( unlikely(rc != X86EMUL_OKAY) )
+        return ERR_PTR(-rc);
+
+#ifndef NDEBUG
+    /*
+     * While we avoid memory allocation (by use of per-CPU data) above,
+     * nevertheless make sure callers properly release the state structure
+     * for forward compatibility.
+     */
+    if ( state->caller )
+    {
+        printk(XENLOG_ERR "Unreleased emulation state acquired by %ps\n",
+               state->caller);
+        dump_execution_state();
+    }
+    state->caller = __builtin_return_address(0);
+#endif
+
+    return state;
+}
+
+static inline void check_state(const struct x86_emulate_state *state)
+{
+#ifndef NDEBUG
+    ASSERT(state->caller);
+#endif
+}
+
+#ifndef NDEBUG
+void x86_emulate_free_state(struct x86_emulate_state *state)
+{
+    check_state(state);
+    state->caller = NULL;
+}
+#endif
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg)
+{
+    check_state(state);
+
+    if ( !(state->desc & ModRM) )
+        return -EINVAL;
+
+    if ( rm )
+        *rm = state->modrm_rm;
+    if ( reg )
+        *reg = state->modrm_reg;
+
+    return state->modrm_mod;
+}
+
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt)
+{
+    check_state(state);
+
+    return state->eip - ctxt->regs->eip;
+}
+
+#endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -532,4 +532,29 @@ x86emul_unhandleable_rw(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
 
+#ifdef __XEN__
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt));
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg);
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt);
+
+#ifdef NDEBUG
+static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+#else
+void x86_emulate_free_state(struct x86_emulate_state *state);
+#endif
+
+#endif
+
 #endif /* __X86_EMULATE_H__ */
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -53,6 +53,10 @@ void hvm_mem_access_emulate_one(enum emu
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes);
 void hvm_emulate_writeback(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 struct segment_register *hvmemul_get_seg_reg(
@@ -60,6 +64,11 @@ struct segment_register *hvmemul_get_seg
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
 
+int hvmemul_insn_fetch(enum x86_segment seg,
+                       unsigned long offset,
+                       void *p_data,
+                       unsigned int bytes,
+                       struct x86_emulate_ctxt *ctxt);
 int hvmemul_do_pio_buffer(uint16_t port,
                           unsigned int size,
                           uint8_t dir,

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (7 preceding siblings ...)
  2016-09-28  8:13 ` [PATCH v2 08/16] SVM: use generic instruction decoding Jan Beulich
@ 2016-09-28  8:13 ` Jan Beulich
  2016-09-29 19:47   ` Andrew Cooper
  2016-09-28  8:14 ` [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
                   ` (7 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 14035 bytes --]

... instead of custom handling. Note that we can't use generic
emulation, as the emulator's far branch support is rather rudimentary
at this point in time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
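
As a reading aid rather than part of the change, a rough sketch of how the
new accessors are meant to combine to locate the far pointer's selector;
the helper below is hypothetical, emulate_gate_op() open-codes the
equivalent logic:

    static int get_far_branch_selector(const struct x86_emulate_state *state,
                                       struct gate_op_ctxt *goc,
                                       uint16_t *sel)
    {
        unsigned int reg;
        enum x86_segment seg;
        unsigned long ea;

        switch ( goc->ctxt.opcode )
        {
        case 0x9a: case 0xea:      /* direct far call/jmp: ptr16:16/32 */
            *sel = x86_insn_immediate(state, 1);
            return X86EMUL_OKAY;
        case 0xff:                 /* indirect far call (/3) / jmp (/5) */
            if ( x86_insn_modrm(state, NULL, &reg) >= 3 ||
                 ((reg & 7) != 3 && (reg & 7) != 5) )
                break;
            ea = x86_insn_operand_ea(state, &seg);
            /* The selector follows the offset, i.e. op_bytes into the
             * operand. */
            return gate_op_read(seg, ea + (x86_insn_opsize(state) >> 3),
                                sel, sizeof(*sel), &goc->ctxt);
        }
        return X86EMUL_UNHANDLEABLE;
    }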

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
 #include <xen/init.h>
 #include <xen/sched.h>
 #include <xen/lib.h>
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/mm.h>
 #include <xen/console.h>
@@ -3167,13 +3168,92 @@ static inline int check_stack_limit(unsi
             (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
 }
 
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( is_pv_32bit_vcpu(current) )
+        addr = (uint32_t)addr;
+
+    if ( !__addr_ok(addr) ||
+         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             goc->insn_fetch && cpu_has_nx
+                             ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void emulate_gate_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, opnd_sel;
-    unsigned int op_default, op_bytes, ad_default, ad_bytes;
-    unsigned long off, eip, opnd_off, base, limit;
-    int jump;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
 
     /* Check whether this fault is due to the use of a call gate. */
     if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3195,7 +3275,8 @@ static void emulate_gate_op(struct cpu_u
      * Decode instruction (and perhaps operand) to determine RPL,
      * whether this is a jump or a call, and the call return offset.
      */
-    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
          !(ar & _SEGMENT_S) ||
          !(ar & _SEGMENT_P) ||
          !(ar & _SEGMENT_CODE) )
@@ -3204,179 +3285,59 @@ static void emulate_gate_op(struct cpu_u
         return;
     }
 
-    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
-    ad_default = ad_bytes = op_default;
-    opnd_sel = opnd_off = 0;
-    jump = -1;
-    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
+    {
+        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
+            do_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
     {
-        switch ( insn_fetch(u8, base, eip, limit) )
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+            break;
+        switch ( modrm_345 & 7 )
         {
-        case 0x66: /* operand-size override */
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            opnd_sel = regs->cs;
-            ASSERT(opnd_sel);
-            continue;
-        case 0x3e: /* DS override */
-            opnd_sel = read_sreg(ds);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x26: /* ES override */
-            opnd_sel = read_sreg(es);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x64: /* FS override */
-            opnd_sel = read_sreg(fs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x65: /* GS override */
-            opnd_sel = read_sreg(gs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x36: /* SS override */
-            opnd_sel = regs->ss;
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0xea:
+            enum x86_segment seg;
+
+        case 5:
             ++jump;
-            /* FALLTHROUGH */
-        case 0x9a:
+            /* fall through */
+        case 3:
             ++jump;
-            opnd_sel = regs->cs;
-            opnd_off = eip;
-            ad_bytes = ad_default;
-            eip += op_bytes + 2;
-            break;
-        case 0xff:
-            {
-                unsigned int modrm;
-
-                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
-                {
-                case 0x28: case 0x68: case 0xa8:
-                    ++jump;
-                    /* FALLTHROUGH */
-                case 0x18: case 0x58: case 0x98:
-                    ++jump;
-                    if ( ad_bytes != 2 )
-                    {
-                        if ( (modrm & 7) == 4 )
-                        {
-                            unsigned int sib;
-                            sib = insn_fetch(u8, base, eip, limit);
-
-                            modrm = (modrm & ~7) | (sib & 7);
-                            if ( ((sib >>= 3) & 7) != 4 )
-                                opnd_off = *(unsigned long *)
-                                    decode_register(sib & 7, regs, 0);
-                            opnd_off <<= sib >> 3;
-                        }
-                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
-                            opnd_off += *(unsigned long *)
-                                decode_register(modrm & 7, regs, 0);
-                        else
-                            modrm |= 0x87;
-                        if ( !opnd_sel )
-                        {
-                            switch ( modrm & 7 )
-                            {
-                            default:
-                                opnd_sel = read_sreg(ds);
-                                break;
-                            case 4: case 5:
-                                opnd_sel = regs->ss;
-                                break;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 1: case 7:
-                            opnd_off = regs->ebx;
-                            break;
-                        case 6:
-                            if ( !(modrm & 0xc0) )
-                                modrm |= 0x80;
-                            else
-                        case 2: case 3:
-                            {
-                                opnd_off = regs->ebp;
-                                if ( !opnd_sel )
-                                    opnd_sel = regs->ss;
-                            }
-                            break;
-                        }
-                        if ( !opnd_sel )
-                            opnd_sel = read_sreg(ds);
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 2: case 4:
-                            opnd_off += regs->esi;
-                            break;
-                        case 1: case 3: case 5:
-                            opnd_off += regs->edi;
-                            break;
-                        }
-                    }
-                    switch ( modrm & 0xc0 )
-                    {
-                    case 0x40:
-                        opnd_off += insn_fetch(s8, base, eip, limit);
-                        break;
-                    case 0x80:
-                        if ( ad_bytes > 2 )
-                            opnd_off += insn_fetch(s32, base, eip, limit);
-                        else
-                            opnd_off += insn_fetch(s16, base, eip, limit);
-                        break;
-                    }
-                    if ( ad_bytes == 4 )
-                        opnd_off = (unsigned int)opnd_off;
-                    else if ( ad_bytes == 2 )
-                        opnd_off = (unsigned short)opnd_off;
-                    break;
-                }
-            }
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
             break;
         }
         break;
     }
 
-    if ( jump < 0 )
-    {
- fail:
-        do_guest_trap(TRAP_gp_fault, regs);
- skip:
-        return;
-    }
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
 
-    if ( (opnd_sel != regs->cs &&
-          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
-    {
-        do_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
+    if ( rc == X86EMUL_EXCEPTION )
+       return;
 
-    opnd_off += op_bytes;
-#define ad_default ad_bytes
-    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
-    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
     {
         do_guest_trap(TRAP_gp_fault, regs);
         return;
@@ -3517,7 +3478,7 @@ static void emulate_gate_op(struct cpu_u
             }
         }
         push(regs->cs);
-        push(eip);
+        push(regs->eip + insn_len);
 #undef push
         regs->esp = esp;
         regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5273,6 +5273,14 @@ void x86_emulate_free_state(struct x86_e
 }
 #endif
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+    check_state(state);
+
+    return state->op_bytes << 3;
+}
+
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg)
@@ -5290,6 +5298,33 @@ x86_insn_modrm(const struct x86_emulate_
     return state->modrm_mod;
 }
 
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg)
+{
+    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+    check_state(state);
+
+    return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+    check_state(state);
+
+    switch ( nr )
+    {
+    case 0:
+        return state->imm1;
+    case 1:
+        return state->imm2;
+    }
+
+    return 0;
+}
+
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -542,9 +542,17 @@ x86_decode_insn(
         void *p_data, unsigned int bytes,
         struct x86_emulate_ctxt *ctxt));
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+                   unsigned int nr);
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt);



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (8 preceding siblings ...)
  2016-09-28  8:13 ` [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-09-28  8:14 ` Jan Beulich
  2016-09-29 20:01   ` Andrew Cooper
  2016-09-28  8:15 ` [PATCH v2 11/16] x86/PV: split out dealing with DRn " Jan Beulich
                   ` (6 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 7040 bytes --]

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
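
Not part of the patch, but worth noting: the helpers take the
(reg, val, ctxt) shape of the emulator's CR access hooks, so a later patch
in the series can presumably wire them up along these lines (sketch only;
hook names assumed from the existing struct x86_emulate_ops):

    static const struct x86_emulate_ops priv_op_ops = {
        .read_cr  = priv_op_read_cr,
        .write_cr = priv_op_write_cr,
        /* remaining hooks omitted in this sketch */
    };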

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2255,6 +2255,107 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Read CR0 */
+        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+        return X86EMUL_OKAY;
+
+    case 2: /* Read CR2 */
+    case 4: /* Read CR4 */
+        *val = curr->arch.pv_vcpu.ctrlreg[reg];
+        return X86EMUL_OKAY;
+
+    case 3: /* Read CR3 */
+    {
+        const struct domain *currd = curr->domain;
+        unsigned long mfn;
+
+        if ( !is_pv_32bit_domain(currd) )
+        {
+            mfn = pagetable_get_pfn(curr->arch.guest_table);
+            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        else
+        {
+            l4_pgentry_t *pl4e =
+                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+            mfn = l4e_get_pfn(*pl4e);
+            unmap_domain_page(pl4e);
+            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        /* PTs should not be shared */
+        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+        return X86EMUL_OKAY;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Write CR0 */
+        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+        {
+            gdprintk(XENLOG_WARNING,
+                    "Attempt to change unmodifiable CR0 flags\n");
+            break;
+        }
+        do_fpu_taskswitch(!!(val & X86_CR0_TS));
+        return X86EMUL_OKAY;
+
+    case 2: /* Write CR2 */
+        curr->arch.pv_vcpu.ctrlreg[2] = val;
+        arch_set_cr2(curr, val);
+        return X86EMUL_OKAY;
+
+    case 3: /* Write CR3 */
+    {
+        struct domain *currd = curr->domain;
+        unsigned long gfn;
+        struct page_info *page;
+        int rc;
+
+        gfn = !is_pv_32bit_domain(currd)
+              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+        if ( !page )
+            break;
+        rc = new_guest_cr3(page_to_mfn(page));
+        put_page(page);
+
+        switch ( rc )
+        {
+        case 0:
+            return X86EMUL_OKAY;
+        case -ERESTART: /* retry after preemption */
+            return X86EMUL_RETRY;
+        }
+        break;
+    }
+
+    case 4: /* Write CR4 */
+        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+        write_cr4(pv_guest_cr4_to_real_cr4(curr));
+        ctxt_switch_levelling(curr);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2667,48 +2768,9 @@ static int emulate_privileged_op(struct
             goto fail;
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
-        {
-        case 0: /* Read CR0 */
-            *reg = (read_cr0() & ~X86_CR0_TS) |
-                v->arch.pv_vcpu.ctrlreg[0];
-            break;
-
-        case 2: /* Read CR2 */
-            *reg = v->arch.pv_vcpu.ctrlreg[2];
-            break;
-            
-        case 3: /* Read CR3 */
-        {
-            unsigned long mfn;
-            
-            if ( !is_pv_32bit_domain(currd) )
-            {
-                mfn = pagetable_get_pfn(v->arch.guest_table);
-                *reg = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            else
-            {
-                l4_pgentry_t *pl4e =
-                    map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
-
-                mfn = l4e_get_pfn(*pl4e);
-                unmap_domain_page(pl4e);
-                *reg = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            /* PTs should not be shared */
-            BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
-        }
-        break;
-
-        case 4: /* Read CR4 */
-            *reg = v->arch.pv_vcpu.ctrlreg[4];
-            break;
-
-        default:
+        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
+                             NULL) != X86EMUL_OKAY )
             goto fail;
-        }
         break;
 
     case 0x21: /* MOV DR?,<reg> */ {
@@ -2732,56 +2794,12 @@ static int emulate_privileged_op(struct
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
         reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
+        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
         {
-        case 0: /* Write CR0 */
-            if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
-            {
-                gdprintk(XENLOG_WARNING,
-                        "Attempt to change unmodifiable CR0 flags.\n");
-                goto fail;
-            }
-            (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
-            break;
-
-        case 2: /* Write CR2 */
-            v->arch.pv_vcpu.ctrlreg[2] = *reg;
-            arch_set_cr2(v, *reg);
-            break;
-
-        case 3: {/* Write CR3 */
-            unsigned long gfn;
-            struct page_info *page;
-
-            gfn = !is_pv_32bit_domain(currd)
-                ? xen_cr3_to_pfn(*reg) : compat_cr3_to_pfn(*reg);
-            page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
-            if ( page )
-            {
-                rc = new_guest_cr3(page_to_mfn(page));
-                put_page(page);
-            }
-            else
-                rc = -EINVAL;
-
-            switch ( rc )
-            {
-            case 0:
-                break;
-            case -ERESTART: /* retry after preemption */
-                goto skip;
-            default:      /* not okay */
-                goto fail;
-            }
+        case X86EMUL_OKAY:
             break;
-        }
-
-        case 4: /* Write CR4 */
-            v->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg);
-            write_cr4(pv_guest_cr4_to_real_cr4(v));
-            ctxt_switch_levelling(v);
-            break;
-
+        case X86EMUL_RETRY: /* retry after preemption */
+            goto skip;
         default:
             goto fail;
         }



[-- Attachment #2: x86-PV-priv-op-split-CR.patch --]
[-- Type: text/plain, Size: 7111 bytes --]

x86/PV: split out dealing with CRn from privileged instruction handling

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2255,6 +2255,107 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Read CR0 */
+        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+        return X86EMUL_OKAY;
+
+    case 2: /* Read CR2 */
+    case 4: /* Read CR4 */
+        *val = curr->arch.pv_vcpu.ctrlreg[reg];
+        return X86EMUL_OKAY;
+
+    case 3: /* Read CR3 */
+    {
+        const struct domain *currd = curr->domain;
+        unsigned long mfn;
+
+        if ( !is_pv_32bit_domain(currd) )
+        {
+            mfn = pagetable_get_pfn(curr->arch.guest_table);
+            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        else
+        {
+            l4_pgentry_t *pl4e =
+                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+            mfn = l4e_get_pfn(*pl4e);
+            unmap_domain_page(pl4e);
+            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        /* PTs should not be shared */
+        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+        return X86EMUL_OKAY;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Write CR0 */
+        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+        {
+            gdprintk(XENLOG_WARNING,
+                    "Attempt to change unmodifiable CR0 flags\n");
+            break;
+        }
+        do_fpu_taskswitch(!!(val & X86_CR0_TS));
+        return X86EMUL_OKAY;
+
+    case 2: /* Write CR2 */
+        curr->arch.pv_vcpu.ctrlreg[2] = val;
+        arch_set_cr2(curr, val);
+        return X86EMUL_OKAY;
+
+    case 3: /* Write CR3 */
+    {
+        struct domain *currd = curr->domain;
+        unsigned long gfn;
+        struct page_info *page;
+        int rc;
+
+        gfn = !is_pv_32bit_domain(currd)
+              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+        if ( !page )
+            break;
+        rc = new_guest_cr3(page_to_mfn(page));
+        put_page(page);
+
+        switch ( rc )
+        {
+        case 0:
+            return X86EMUL_OKAY;
+        case -ERESTART: /* retry after preemption */
+            return X86EMUL_RETRY;
+        }
+        break;
+    }
+
+    case 4: /* Write CR4 */
+        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+        write_cr4(pv_guest_cr4_to_real_cr4(curr));
+        ctxt_switch_levelling(curr);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2667,48 +2768,9 @@ static int emulate_privileged_op(struct
             goto fail;
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
-        {
-        case 0: /* Read CR0 */
-            *reg = (read_cr0() & ~X86_CR0_TS) |
-                v->arch.pv_vcpu.ctrlreg[0];
-            break;
-
-        case 2: /* Read CR2 */
-            *reg = v->arch.pv_vcpu.ctrlreg[2];
-            break;
-            
-        case 3: /* Read CR3 */
-        {
-            unsigned long mfn;
-            
-            if ( !is_pv_32bit_domain(currd) )
-            {
-                mfn = pagetable_get_pfn(v->arch.guest_table);
-                *reg = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            else
-            {
-                l4_pgentry_t *pl4e =
-                    map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
-
-                mfn = l4e_get_pfn(*pl4e);
-                unmap_domain_page(pl4e);
-                *reg = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-            }
-            /* PTs should not be shared */
-            BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
-        }
-        break;
-
-        case 4: /* Read CR4 */
-            *reg = v->arch.pv_vcpu.ctrlreg[4];
-            break;
-
-        default:
+        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
+                             NULL) != X86EMUL_OKAY )
             goto fail;
-        }
         break;
 
     case 0x21: /* MOV DR?,<reg> */ {
@@ -2732,56 +2794,12 @@ static int emulate_privileged_op(struct
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
         reg = decode_register(modrm_rm, regs, 0);
-        switch ( modrm_reg )
+        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
         {
-        case 0: /* Write CR0 */
-            if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
-            {
-                gdprintk(XENLOG_WARNING,
-                        "Attempt to change unmodifiable CR0 flags.\n");
-                goto fail;
-            }
-            (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
-            break;
-
-        case 2: /* Write CR2 */
-            v->arch.pv_vcpu.ctrlreg[2] = *reg;
-            arch_set_cr2(v, *reg);
-            break;
-
-        case 3: {/* Write CR3 */
-            unsigned long gfn;
-            struct page_info *page;
-
-            gfn = !is_pv_32bit_domain(currd)
-                ? xen_cr3_to_pfn(*reg) : compat_cr3_to_pfn(*reg);
-            page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
-            if ( page )
-            {
-                rc = new_guest_cr3(page_to_mfn(page));
-                put_page(page);
-            }
-            else
-                rc = -EINVAL;
-
-            switch ( rc )
-            {
-            case 0:
-                break;
-            case -ERESTART: /* retry after preemption */
-                goto skip;
-            default:      /* not okay */
-                goto fail;
-            }
+        case X86EMUL_OKAY:
             break;
-        }
-
-        case 4: /* Write CR4 */
-            v->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg);
-            write_cr4(pv_guest_cr4_to_real_cr4(v));
-            ctxt_switch_levelling(v);
-            break;
-
+        case X86EMUL_RETRY: /* retry after preemption */
+            goto skip;
         default:
             goto fail;
         }
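
The priv_op_read_cr()/priv_op_write_cr() helpers take a
struct x86_emulate_ctxt pointer that this legacy caller simply passes
as NULL; keeping that parameter gives them the shape of emulator
hooks, in preparation for wiring them into the generic emulator later
in the series. A minimal sketch of how such handlers could eventually
be collected into an ops table (the structure and field names below
are illustrative assumptions, not taken from this patch):

    /* Hypothetical sketch only -- layout and names assumed for
     * illustration; the real emulator interface is not reproduced. */
    struct x86_emulate_ctxt;

    struct pv_priv_ops {
        int (*read_cr)(unsigned int reg, unsigned long *val,
                       struct x86_emulate_ctxt *ctxt);
        int (*write_cr)(unsigned int reg, unsigned long val,
                        struct x86_emulate_ctxt *ctxt);
    };

    static const struct pv_priv_ops priv_ops = {
        .read_cr  = priv_op_read_cr,
        .write_cr = priv_op_write_cr,
    };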

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 11/16] x86/PV: split out dealing with DRn from privileged instruction handling
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (9 preceding siblings ...)
  2016-09-28  8:14 ` [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
@ 2016-09-28  8:15 ` Jan Beulich
  2016-09-29 20:13   ` Andrew Cooper
  2016-09-28  8:16 ` [PATCH v2 12/16] x86/PV: split out dealing with MSRs " Jan Beulich
                   ` (5 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 2289 bytes --]

This is in preparation for using the generic emulator here.

Some care is needed for the time being so as not to unduly alter guest
register state: the local variable "res" can only go away once this
code has been fully switched over to using x86_emulate().

Also switch to IS_ERR_VALUE() instead of (incorrectly) open coding it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
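
For reference, a self-contained sketch of the difference between the
open-coded test being removed and IS_ERR_VALUE(); the MAX_ERRNO value
follows the usual Linux-derived convention and is an assumption here,
not something this patch states:

    /* Hypothetical illustration -- not part of the patch. */
    #include <stdbool.h>

    #define MAX_ERRNO 4095

    static bool is_err_value(unsigned long x)
    {
        /* Error codes occupy the top MAX_ERRNO values of the range,
         * i.e. (unsigned long)-1 down to (unsigned long)-MAX_ERRNO. */
        return x >= (unsigned long)-MAX_ERRNO;
    }

    static bool open_coded_check(unsigned long x)
    {
        /* The form being replaced: only the top 255 values count as
         * errors, so the two tests disagree for -256 .. -MAX_ERRNO. */
        return x > (unsigned long)-256;
    }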

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2356,6 +2356,26 @@ static int priv_op_write_cr(unsigned int
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    unsigned long res = do_get_debugreg(reg);
+
+    if ( IS_ERR_VALUE(res) )
+        return X86EMUL_UNHANDLEABLE;
+
+    *val = res;
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    return do_set_debugreg(reg, val) == 0
+           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2774,16 +2794,14 @@ static int emulate_privileged_op(struct
         break;
 
     case 0x21: /* MOV DR?,<reg> */ {
-        unsigned long res;
         opcode = insn_fetch(u8, code_base, eip, code_limit);
         if ( opcode < 0xc0 )
             goto fail;
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
+        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
+                             NULL) != X86EMUL_OKAY )
             goto fail;
-        *reg = res;
         break;
     }
 
@@ -2812,7 +2830,7 @@ static int emulate_privileged_op(struct
         modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
         modrm_rm  |= (opcode >> 0) & 7;
         reg = decode_register(modrm_rm, regs, 0);
-        if ( do_set_debugreg(modrm_reg, *reg) != 0 )
+        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
             goto fail;
         break;
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 12/16] x86/PV: split out dealing with MSRs from privileged instruction handling
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (10 preceding siblings ...)
  2016-09-28  8:15 ` [PATCH v2 11/16] x86/PV: split out dealing with DRn " Jan Beulich
@ 2016-09-28  8:16 ` Jan Beulich
  2016-09-29 20:44   ` Andrew Cooper
  2016-09-28  8:17 ` [PATCH v2 13/16] x86emul: support XSETBV Jan Beulich
                   ` (4 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 26185 bytes --]

This is in preparation for using the generic emulator here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Re-base.
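
One thing worth calling out in the hunks below: the vPMU MSR ranges
use case labels placed inside an if() body, so the Intel ranges set
vpmu_msr before falling into code that the AMD ranges jump into
directly. A minimal, self-contained sketch of that control flow
(hypothetical constants, unrelated to real MSR numbers):

    #include <stdbool.h>
    #include <stdio.h>

    enum { INTEL_RANGE = 1, AMD_RANGE = 2, SOMETHING_ELSE = 3 };

    static void classify(int reg, bool intel_cpu)
    {
        bool vpmu_msr = false;

        switch ( reg )
        {
        case INTEL_RANGE:
            if ( intel_cpu )
            {
                vpmu_msr = true;
                /* Execution continues across the label below. */
        case AMD_RANGE:
                if ( vpmu_msr || !intel_cpu )
                {
                    printf("vPMU handling\n");
                    return;
                }
            }
            /* fall through */
        default:
            printf("default handling\n");
        }
    }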

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2386,6 +2386,345 @@ static inline uint64_t guest_misc_enable
     return val;
 }
 
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdgsbase()
+                                : curr->arch.pv_vcpu.gs_base_kernel;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = curr->arch.pv_vcpu.gs_base_user;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( unlikely(is_cpufreq_controller(currd)) )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_UCODE_REV:
+        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+                break;
+            sync_core();
+        }
+        goto normal;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        *val = guest_misc_enable(*val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[0];
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_PERF_CAPABILITIES:
+        /* No extra capabilities are supported. */
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
+            break;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+            /* fall through */
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                /* Don't leak PMU MSRs to unprivileged domains. */
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    *val = 0;
+                else if ( vpmu_do_rdmsr(reg, val) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( rdmsr_hypervisor_regs(reg, val) )
+            return X86EMUL_OKAY;
+
+        rc = vmce_rdmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+        /* fall through */
+    case MSR_EFER:
+    normal:
+        /* Everyone can read the MSR space. */
+        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+#include "x86_64/mmconfig.h"
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        uint64_t temp;
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrfsbase(val);
+        curr->arch.pv_vcpu.fs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        wrgsbase(val);
+        curr->arch.pv_vcpu.gs_base_kernel = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) ||
+             wrmsr_safe(MSR_SHADOW_GS_BASE, val) )
+            break;
+        curr->arch.pv_vcpu.gs_base_user = val;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+    case MSR_K8_HWCR:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_NB_CFG:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_FAM10H_MMIO_CONF_BASE:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+            break;
+        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+             temp != val :
+             ((temp ^ val) &
+              ~(FAM10H_MMIO_CONF_ENABLE |
+                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_UCODE_REV:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val != guest_misc_enable(temp) )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MPERF:
+    case MSR_IA32_APERF:
+        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_PERF_CTL:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_THERM_CONTROL:
+    case MSR_IA32_ENERGY_PERF_BIAS:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[0] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(reg, val);
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
+            break;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    return X86EMUL_OKAY;
+
+                if ( vpmu_do_wrmsr(reg, val, 0) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+            return X86EMUL_OKAY;
+
+        rc = vmce_wrmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+
+        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+    invalid:
+            gdprintk(XENLOG_WARNING,
+                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+                     reg, temp, val);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
 /* Instruction fetch with error handling. */
 #define insn_fetch(type, base, eip, limit)                                  \
 ({  unsigned long _rc, _ptr = (base) + (eip);                               \
@@ -2401,14 +2740,6 @@ static inline uint64_t guest_misc_enable
     }                                                                       \
     (eip) += sizeof(_x); _x; })
 
-static int is_cpufreq_controller(struct domain *d)
-{
-    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
-            is_hardware_domain(d));
-}
-
-#include "x86_64/mmconfig.h"
-
 static int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -2433,7 +2764,6 @@ static int emulate_privileged_op(struct
     char *io_emul_stub = NULL;
     void (*io_emul)(struct cpu_user_regs *);
     uint64_t val;
-    bool_t vpmu_msr;
 
     if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
         goto fail;
@@ -2834,196 +3164,11 @@ static int emulate_privileged_op(struct
             goto fail;
         break;
 
-    case 0x30: /* WRMSR */ {
-        uint32_t eax = regs->eax;
-        uint32_t edx = regs->edx;
-        uint64_t msr_content = ((uint64_t)edx << 32) | eax;
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrfsbase(msr_content);
-            v->arch.pv_vcpu.fs_base = msr_content;
-            break;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            wrgsbase(msr_content);
-            v->arch.pv_vcpu.gs_base_kernel = msr_content;
-            break;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) )
-                goto fail;
-            v->arch.pv_vcpu.gs_base_user = msr_content;
-            break;
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-        case MSR_K8_HWCR:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_AMD64_NB_CFG:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) ||
-                 (eax != (uint32_t)val) ||
-                 ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_FAM10H_MMIO_CONF_BASE:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) )
-                goto fail;
-            if (
-                 (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
-                 val != msr_content :
-                 ((val ^ msr_content) &
-                  ~( FAM10H_MMIO_CONF_ENABLE |
-                    (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
-                     FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
-                    ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
-                     FAM10H_MMIO_CONF_BASE_SHIFT))) )
-                goto invalid;
-            if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_UCODE_REV:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            if ( msr_content )
-                goto invalid;
-            break;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            if ( msr_content != val )
-                goto invalid;
-            break;
-        case MSR_IA32_MPERF:
-        case MSR_IA32_APERF:
-            if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) &&
-                ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content ) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_PERF_CTL:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-        case MSR_IA32_THERM_CONTROL:
-        case MSR_IA32_ENERGY_PERF_BIAS:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-                goto fail;
-            if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
-                break;
-            if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
-                goto fail;
-            break;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask[0] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
-                goto fail;
-            v->arch.pv_vcpu.dr_mask
-                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
-            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
-                wrmsrl(regs->_ecx, msr_content);
-            break;
-
-        case MSR_INTEL_PLATFORM_INFO:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-                 msr_content ||
-                 rdmsr_safe(MSR_INTEL_PLATFORM_INFO, msr_content) )
-                goto fail;
-            break;
-
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                        break;
-
-                    if ( vpmu_do_wrmsr(regs->ecx, msr_content, 0) )
-                        goto fail;
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
-                break;
-
-            rc = vmce_wrmsr(regs->ecx, msr_content);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                break;
-
-            if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) )
-        invalid:
-                gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
-                        "0x%016"PRIx64" to 0x%016"PRIx64".\n",
-                        _p(regs->ecx), val, msr_content);
-            break;
-        }
+    case 0x30: /* WRMSR */
+        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
+                               NULL) != X86EMUL_OKAY )
+            goto fail;
         break;
-    }
 
     case 0x31: /* RDTSC */
         if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
@@ -3039,138 +3184,11 @@ static int emulate_privileged_op(struct
         break;
 
     case 0x32: /* RDMSR */
-        vpmu_msr = 0;
-        switch ( regs->_ecx )
-        {
-        case MSR_FS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
-            goto rdmsr_writeback;
-        case MSR_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = cpu_has_fsgsbase ? __rdgsbase()
-                                   : v->arch.pv_vcpu.gs_base_kernel;
-            goto rdmsr_writeback;
-        case MSR_SHADOW_GS_BASE:
-            if ( is_pv_32bit_domain(currd) )
-                goto fail;
-            val = v->arch.pv_vcpu.gs_base_user;
-            goto rdmsr_writeback;
-        case MSR_K7_FID_VID_CTL:
-        case MSR_K7_FID_VID_STATUS:
-        case MSR_K8_PSTATE_LIMIT:
-        case MSR_K8_PSTATE_CTRL:
-        case MSR_K8_PSTATE_STATUS:
-        case MSR_K8_PSTATE0:
-        case MSR_K8_PSTATE1:
-        case MSR_K8_PSTATE2:
-        case MSR_K8_PSTATE3:
-        case MSR_K8_PSTATE4:
-        case MSR_K8_PSTATE5:
-        case MSR_K8_PSTATE6:
-        case MSR_K8_PSTATE7:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-                goto fail;
-            if ( !is_cpufreq_controller(currd) )
-            {
-                regs->eax = regs->edx = 0;
-                break;
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_UCODE_REV:
-            BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
-                    goto fail;
-                sync_core();
-            }
-            goto rdmsr_normal;
-        case MSR_IA32_MISC_ENABLE:
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
-            val = guest_misc_enable(val);
-            goto rdmsr_writeback;
-
-        case MSR_AMD64_DR0_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask[0];
-            regs->edx = 0;
-            break;
-        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-                goto fail;
-            regs->eax = v->arch.pv_vcpu.dr_mask
-                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
-            regs->edx = 0;
-            break;
-        case MSR_IA32_PERF_CAPABILITIES:
-            /* No extra capabilities are supported */
-            regs->eax = regs->edx = 0;
-            break;
-
-        case MSR_INTEL_PLATFORM_INFO:
-            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-                 rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
-                goto fail;
-            regs->eax = regs->edx = 0;
-            break;
-
-        case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-        case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-        case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-        case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-            {
-                vpmu_msr = 1;
-        case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-        case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-                if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-                {
-
-                    if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                         !is_hardware_domain(v->domain) )
-                    {
-                        /* Don't leak PMU MSRs to unprivileged domains */
-                        regs->eax = regs->edx = 0;
-                        break;
-                    }
-
-                    if ( vpmu_do_rdmsr(regs->ecx, &val) )
-                        goto fail;
-
-                    regs->eax = (uint32_t)val;
-                    regs->edx = (uint32_t)(val >> 32);
-                    break;
-                }
-            }
-            /*FALLTHROUGH*/
-
-        default:
-            if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
-                goto rdmsr_writeback;
-
-            rc = vmce_rdmsr(regs->ecx, &val);
-            if ( rc < 0 )
-                goto fail;
-            if ( rc )
-                goto rdmsr_writeback;
-
-        case MSR_EFER:
- rdmsr_normal:
-            /* Everyone can read the MSR space. */
-            /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
-                        _p(regs->ecx));*/
-            if ( rdmsr_safe(regs->ecx, val) )
-                goto fail;
+        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
+            goto fail;
  rdmsr_writeback:
-            regs->eax = (uint32_t)val;
-            regs->edx = (uint32_t)(val >> 32);
-            break;
-        }
+        regs->eax = (uint32_t)val;
+        regs->edx = (uint32_t)(val >> 32);
         break;
 
     case 0xa2: /* CPUID */



[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 13/16] x86emul: support XSETBV
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (11 preceding siblings ...)
  2016-09-28  8:16 ` [PATCH v2 12/16] x86/PV: split out dealing with MSRs " Jan Beulich
@ 2016-09-28  8:17 ` Jan Beulich
  2016-09-29 20:45   ` Andrew Cooper
  2016-09-28  8:18 ` [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
                   ` (3 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1313 bytes --]

This is a prereq for switching PV privileged op emulation to the
generic instruction emulator. Since handle_xsetbv() is already capable
of dealing with all guest kinds, avoid introducing another hook here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Explicitly generate #UD when vex.pfx is non-zero.
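
For readers unfamiliar with the instruction, a minimal illustrative
sketch (not part of the patch below) of the guest-side operation the
new case emulates: XSETBV takes the XCR index in %ecx and the 64-bit
value in %edx:%eax, which is why the emulation assembles
_regs._eax | (_regs.rdx << 32) before handing it to handle_xsetbv().

/* Illustrative only -- not part of the patch below. */
static inline void xsetbv(uint32_t index, uint64_t value)
{
    asm volatile ( "xsetbv"
                   :: "c" (index),
                      "a" ((uint32_t)value),
                      "d" ((uint32_t)(value >> 32)) );
}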

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4178,6 +4178,23 @@ x86_emulate(
 
         switch( modrm )
         {
+#ifdef __XEN__
+        case 0xd1: /* xsetbv */
+        {
+            unsigned long cr4;
+
+            generate_exception_if(vex.pfx, EXC_UD, -1);
+            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                cr4 = 0;
+            generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD, -1);
+            generate_exception_if(!mode_ring0() ||
+                                  handle_xsetbv(_regs._ecx,
+                                                _regs._eax | (_regs.rdx << 32)),
+                                  EXC_GP, 0);
+            goto no_writeback;
+        }
+#endif
+
         case 0xdf: /* invlpga */
             generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
             generate_exception_if(!mode_ring0(), EXC_GP, 0);




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (12 preceding siblings ...)
  2016-09-28  8:17 ` [PATCH v2 13/16] x86emul: support XSETBV Jan Beulich
@ 2016-09-28  8:18 ` Jan Beulich
  2016-09-29 20:46   ` Andrew Cooper
  2016-09-28  8:18 ` [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
                   ` (2 subsequent siblings)
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:18 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1894 bytes --]

Sort the special case opcode 0f01 entries numerically, insert blank
lines between each of the cases, and properly place opening braces.

No functional change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4195,6 +4195,14 @@ x86_emulate(
         }
 #endif
 
+        case 0xd4: /* vmfunc */
+            generate_exception_if(lock_prefix | rep_prefix() | (vex.pfx == vex_66),
+                                  EXC_UD, -1);
+            fail_if(!ops->vmfunc);
+            if ( (rc = ops->vmfunc(ctxt) != X86EMUL_OKAY) )
+                goto done;
+            goto no_writeback;
+
         case 0xdf: /* invlpga */
             generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
             generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -4203,7 +4211,9 @@ x86_emulate(
                                    ctxt)) )
                 goto done;
             goto no_writeback;
-        case 0xf9: /* rdtscp */ {
+
+        case 0xf9: /* rdtscp */
+        {
             uint64_t tsc_aux;
             fail_if(ops->read_msr == NULL);
             if ( (rc = ops->read_msr(MSR_TSC_AUX, &tsc_aux, ctxt)) != 0 )
@@ -4211,14 +4221,9 @@ x86_emulate(
             _regs.ecx = (uint32_t)tsc_aux;
             goto rdtsc;
         }
-        case 0xd4: /* vmfunc */
-            generate_exception_if(lock_prefix | rep_prefix() | (vex.pfx == vex_66),
-                                  EXC_UD, -1);
-            fail_if(ops->vmfunc == NULL);
-            if ( (rc = ops->vmfunc(ctxt) != X86EMUL_OKAY) )
-                goto done;
-            goto no_writeback;
-	case 0xfc: /* clzero */ {
+
+        case 0xfc: /* clzero */
+        {
             unsigned int eax = 1, ebx = 0, dummy = 0;
             unsigned long zero = 0;
 




[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (13 preceding siblings ...)
  2016-09-28  8:18 ` [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
@ 2016-09-28  8:18 ` Jan Beulich
  2016-09-29 21:06   ` Andrew Cooper
  2016-09-28  8:19 ` [PATCH v2 16/16] x86emul: don't assume a memory operand Jan Beulich
  2016-09-28  8:42 ` [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:18 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 47855 bytes --]

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment). Its handling in the epilogue
code involves deferring the raising of the single-step trap until after
registers have been updated. This should probably have been that way
from the beginning, to allow the inject_hw_exception() hook to see
updated register state (in case it cares) - it's a trap, after all.

The other small tweak to the emulator concerns the single-iteration
handling of INS and OUTS: since we don't want to handle any other
memory access instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too. The read() / write() hook
pointers get checked for that purpose.

And finally, exception handling gets changed for REP INS / REP OUTS:
if the hook returns X86EMUL_EXCEPTION, register state will still get
updated when some iterations have been performed (but the rIP update
will be suppressed if not all of them were handled). While on the HVM
side the VA -> LA -> PA translation process clips the number of
repetitions, doing so would unduly complicate the PV side code being
added here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
One thing to be considered is that despite avoiding the handling of
memory reads and writes (other than for INS and OUTS) the set of insns
now getting potentially handled by the emulator is much larger than
before. A possible solution to this would be a new hook to be called
between decode and execution stages, allowing further restrictions to
be enforced. Of course this could easily be a follow-up patch, as the
one here is quite big already.

Another thing to consider is extending the X86EMUL_EXCEPTION handling
change mentioned above to other string instructions. In that case this
should probably be broken out into a prereq patch.
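
As a reading aid, a minimal hedged sketch (not part of the patch) of
the partial-completion contract described above for the rep_ins() /
rep_outs() hooks: *reps is zeroed up front, bumped once per completed
iteration, and left at the completed count when X86EMUL_EXCEPTION is
returned, which lets x86_emulate() commit register state for the work
already done while suppressing the rIP update. The do_one_iteration()
helper below is hypothetical.

/*
 * Illustrative only -- not part of the patch.  do_one_iteration() is a
 * hypothetical stand-in for the actual port and memory accesses done
 * per repetition.
 */
static int example_rep_hook(unsigned long *reps, /* in: requested, out: done */
                            struct x86_emulate_ctxt *ctxt)
{
    unsigned long goal = *reps;

    *reps = 0;

    while ( *reps < goal )
    {
        if ( do_one_iteration(ctxt) != X86EMUL_OKAY )
            return X86EMUL_EXCEPTION; /* *reps holds the completed count */
        ++*reps;
    }

    return X86EMUL_OKAY;
}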

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -20,6 +20,9 @@ typedef bool bool_t;
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
+#define likely(x)   __builtin_expect(!!(x), true)
+#define unlikely(x) __builtin_expect(!!(x), false)
+
 #define __packed __attribute__((packed))
 
 /* For generic assembly code: use macros to define operation/operand sizes. */
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         hvm_inject_page_fault(pfec, addr);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 hvm_inject_page_fault(pfec, addr & PAGE_MASK);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -568,6 +569,7 @@ static int hvmemul_virtual_to_linear(
         return X86EMUL_UNHANDLEABLE;
 
     /* This is a singleton operation: fail it with an exception. */
+    *reps = 0;
     hvmemul_ctxt->exn_pending = 1;
     hvmemul_ctxt->trap.vector =
         (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -659,16 +659,13 @@ static void do_guest_trap(unsigned int t
                 trapstr(trapnr), trapnr, regs->error_code);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1292,7 +1289,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1325,7 +1322,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2009,6 +2006,154 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              const struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( unlikely(limit < bytes - 1) ||
+             unlikely(offset > limit - bytes + 1) )
+        {
+            do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                          ctxt->regs);
+            return false;
+        }
+
+        *addr = (uint32_t)*addr;
+    }
+    else if ( unlikely(!__addr_ok(*addr)) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return false;
+    }
+
+    return true;
+}
+
+static int priv_op_insn_fetch(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             cpu_has_nx ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access to I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default:
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2255,6 +2400,234 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        do_guest_trap(TRAP_gp_fault, ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, PFEC_write_access);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, 0);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2395,6 +2768,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2422,6 +2796,22 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior we defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2725,493 +3115,170 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
 
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
-    {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
+    return X86EMUL_OKAY;
+}
 
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
+static int priv_op_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct cpu_user_regs regs = *ctxt->regs;
+
+    regs._eax = *eax;
+    regs._ebx = *ebx;
+    regs._ecx = *ecx;
+    regs._edx = *edx;
+
+    pv_cpuid(&regs);
+
+    *eax = regs._eax;
+    *ebx = regs._ebx;
+    *ecx = regs._ecx;
+    *edx = regs._edx;
 
-        port = (u16)regs->edx;
+    return X86EMUL_OKAY;
+}
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
-                                     PFEC_write_access);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+static int priv_op_hw_exception(uint8_t vector, int32_t error_code,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    do_guest_trap(vector, ctxt->regs);
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(esi)
-                                     + op_bytes - rc, 0);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
+    if ( error_code >= 0 )
+    {
+        struct trap_bounce *tb = &current->arch.pv_vcpu.trap_bounce;
 
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    return X86EMUL_EXCEPTION;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .write               = x86emul_unhandleable_rw,
+    .cmpxchg             = x86emul_unhandleable_cx,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = priv_op_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+    .inject_hw_exception = priv_op_hw_exception,
+};
 
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-        goto done;
-    }
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
     /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
+     * Don't have x86_emulate() inject single step traps, as we want #DB
+     * also delivered for I/O break points (see below).
      */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+    if ( regs->_eflags & X86_EFLAGS_TF )
+    {
+        ctxt.bpmatch = DR_STEP;
+        regs->_eflags &= ~X86_EFLAGS_TF;
+    }
+    eflags = regs->_eflags;
 
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
+    /* Un-mirror virtualized state from EFLAGS. */
+    if ( (regs->_eflags ^ eflags) & X86_EFLAGS_IF )
     {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
+        /* The only allowed insns altering EFLAGS.IF are CLI/STI. */
+        ASSERT((ctxt.ctxt.opcode & ~1) == 0xfa);
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(curr, evtchn_upcall_mask) = (opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
     }
-
-    /* No decode of this single-byte opcode. */
-    goto fail;
-
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
-            break;
-        case 0xd1: /* XSETBV */
-        {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
-
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+    regs->_eflags |= X86_EFLAGS_IF;
+    /* Nothing we allow to be emulated can change IOPL or TF. */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IOPL | X86_EFLAGS_TF)));
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+    if ( ctxt.bpmatch & DR_STEP )
+        regs->_eflags |= X86_EFLAGS_TF;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
             {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
-
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
-
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
-
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+                uint64_t val = rdtsc();
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
-
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
-
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
-
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        pv_cpuid(regs);
-        break;
-
-    default:
-        goto fail;
+        /* fall through */
+    case X86EMUL_RETRY:
+    case X86EMUL_EXCEPTION:
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3541,7 +3609,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,7 +857,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -1075,7 +1079,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc != X86EMUL_DONE ? rc : X86EMUL_OKAY;
 
     /* Ensure that the TSS is valid and has an io-bitmap-offset field. */
     if ( !tr.attr.fields.p ||
@@ -1610,6 +1614,17 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
@@ -2280,6 +2295,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
+    bool tf = ctxt->regs->eflags & EFLG_TF;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     enum x86_swint_type swint_type;
@@ -2731,14 +2747,10 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( ((nr_reps == 1) && (ops->write != x86emul_unhandleable_rw)) ||
+             !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2750,6 +2762,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2760,14 +2774,10 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( ((nr_reps == 1) && (ops->read != x86emul_unhandleable_rw)) ||
+             !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2781,6 +2791,8 @@ x86_emulate(
             _regs.esi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3038,6 +3050,7 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
         else if ( rc != X86EMUL_OKAY )
             goto done;
@@ -3846,7 +3859,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5198,11 +5215,6 @@ x86_emulate(
     }
 
  no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
@@ -5210,7 +5222,18 @@ x86_emulate(
     if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip    = _regs.eip;
+        ctxt->regs->eflags = _regs.eflags;
+        rc = X86EMUL_OKAY;
+    }
+
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( tf && (rc == X86EMUL_OKAY) && ops->inject_hw_exception )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -111,6 +111,13 @@ struct __packed segment_register {
 #define X86EMUL_RETRY          3
  /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */
 #define X86EMUL_CMPXCHG_FAILED 3
+ /*
+  * Operation fully done by one of the hooks:
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -531,6 +538,15 @@ x86emul_unhandleable_rw(
     void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
+/* Unhandleable cmpxchg */
+int
+x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt);
 
 #ifdef __XEN__
 



[-- Attachment #2: x86-PV-priv-op-generic-emul.patch --]
[-- Type: text/plain, Size: 47919 bytes --]

x86/PV: use generic emulator for privileged instruction handling

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment). Its handling in the epilogue
code involves moving the raising of the single-step trap until after
the registers have been updated. This should probably have been that
way from the beginning, to allow the inject_hw_exception() hook to see
the updated register state (in case it cares) - it's a trap, after all.

The other small tweak to the emulator concerns single-iteration
handling of INS and OUTS: since we don't want to handle any other
memory access instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too. The read() / write() hook
pointers get checked for that purpose.

And finally, exception handling gets changed for REP INS / REP OUTS: if
the hook returns X86EMUL_EXCEPTION, register state will still get
updated if some iterations have been performed (but the rIP update will
be suppressed if not all of them were handled). While on the HVM side
the VA -> LA -> PA translation process clips the number of repetitions,
doing so would unduly complicate the PV side code being added here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
One thing to be considered is that, despite avoiding the handling of
memory reads and writes (other than for INS and OUTS), the set of insns
now potentially handled by the emulator is much larger than before. A
possible solution would be a new hook, called between the decode and
execution stages, allowing further restrictions to be enforced. Of
course this could easily be a follow-up patch, as the one here is quite
big already.
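
As an illustration only, here is a purely hypothetical shape such a hook
could take (neither the name nor the signature exist anywhere in this
series; both are invented for the sketch):

/* Hypothetical extra member of struct x86_emulate_ops. */
int (*validate)(const struct x86_emulate_state *state,
                struct x86_emulate_ctxt *ctxt);

/* x86_emulate() could then invoke it between decode and execution: */
if ( ops->validate &&
     (rc = ops->validate(&state, ctxt)) != X86EMUL_OKAY )
    goto done;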

Another thing to consider is extending the X86EMUL_EXCEPTION handling
change mentioned above to other string instructions. In that case it
should probably be broken out into a prereq patch.
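
For reference, a minimal sketch (not part of the patch) of how a
read_io() hook can use the new X86EMUL_DONE return code this patch
introduces. The two helpers named below are invented placeholders; the
hook signature matches the one used by priv_op_read_io() further down.

static int example_read_io(unsigned int port, unsigned int bytes,
                           unsigned long *val,
                           struct x86_emulate_ctxt *ctxt)
{
    if ( fully_handled_here(port, bytes) )  /* placeholder predicate */
    {
        /* The hook updates the guest GPRs itself (placeholder helper)... */
        do_port_read_updating_gprs(port, bytes, ctxt->regs);
        /*
         * ... and returns X86EMUL_DONE so that x86_emulate() skips the
         * GPR writeback, only committing rIP/EFLAGS and reporting
         * X86EMUL_OKAY to its caller. Valid for non-string insns only.
         */
        return X86EMUL_DONE;
    }

    *val = 0;  /* placeholder: a real hook would read the port here */
    return X86EMUL_OKAY;
}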

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -20,6 +20,9 @@ typedef bool bool_t;
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
+#define likely(x)   __builtin_expect(!!(x), true)
+#define unlikely(x) __builtin_expect(!!(x), false)
+
 #define __packed __attribute__((packed))
 
 /* For generic assembly code: use macros to define operation/operand sizes. */
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         hvm_inject_page_fault(pfec, addr);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 hvm_inject_page_fault(pfec, addr & PAGE_MASK);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -568,6 +569,7 @@ static int hvmemul_virtual_to_linear(
         return X86EMUL_UNHANDLEABLE;
 
     /* This is a singleton operation: fail it with an exception. */
+    *reps = 0;
     hvmemul_ctxt->exn_pending = 1;
     hvmemul_ctxt->trap.vector =
         (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -659,16 +659,13 @@ static void do_guest_trap(unsigned int t
                 trapstr(trapnr), trapnr, regs->error_code);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1292,7 +1289,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1325,7 +1322,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2009,6 +2006,154 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              const struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( unlikely(limit < bytes - 1) ||
+             unlikely(offset > limit - bytes + 1) )
+        {
+            do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                          ctxt->regs);
+            return false;
+        }
+
+        *addr = (uint32_t)*addr;
+    }
+    else if ( unlikely(!__addr_ok(*addr)) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return false;
+    }
+
+    return true;
+}
+
+static int priv_op_insn_fetch(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             cpu_has_nx ? PFEC_insn_fetch : 0 );
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access the I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default:
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2255,6 +2400,234 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        do_guest_trap(TRAP_gp_fault, ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, PFEC_write_access);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, 0);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2395,6 +2768,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2422,6 +2796,22 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior we defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2725,493 +3115,170 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
 
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
-    {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
+    return X86EMUL_OKAY;
+}
 
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
+static int priv_op_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct cpu_user_regs regs = *ctxt->regs;
+
+    regs._eax = *eax;
+    regs._ebx = *ebx;
+    regs._ecx = *ecx;
+    regs._edx = *edx;
+
+    pv_cpuid(&regs);
+
+    *eax = regs._eax;
+    *ebx = regs._ebx;
+    *ecx = regs._ecx;
+    *edx = regs._edx;
 
-        port = (u16)regs->edx;
+    return X86EMUL_OKAY;
+}
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
-                                     PFEC_write_access);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+static int priv_op_hw_exception(uint8_t vector, int32_t error_code,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    do_guest_trap(vector, ctxt->regs);
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(esi)
-                                     + op_bytes - rc, 0);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
+    if ( error_code >= 0 )
+    {
+        struct trap_bounce *tb = &current->arch.pv_vcpu.trap_bounce;
 
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    return X86EMUL_EXCEPTION;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .write               = x86emul_unhandleable_rw,
+    .cmpxchg             = x86emul_unhandleable_cx,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = priv_op_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+    .inject_hw_exception = priv_op_hw_exception,
+};
 
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-        goto done;
-    }
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
     /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
+     * Don't have x86_emulate() inject single step traps, as we want #DB
+     * also delivered for I/O break points (see below).
      */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+    if ( regs->_eflags & X86_EFLAGS_TF )
+    {
+        ctxt.bpmatch = DR_STEP;
+        regs->_eflags &= ~X86_EFLAGS_TF;
+    }
+    eflags = regs->_eflags;
 
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
+    /* Un-mirror virtualized state from EFLAGS. */
+    if ( (regs->_eflags ^ eflags) & X86_EFLAGS_IF )
     {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
+        /* The only allowed insns altering EFLAGS.IF are CLI/STI. */
+        ASSERT((ctxt.ctxt.opcode & ~1) == 0xfa);
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(curr, evtchn_upcall_mask) = (opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
     }
-
-    /* No decode of this single-byte opcode. */
-    goto fail;
-
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
-            break;
-        case 0xd1: /* XSETBV */
-        {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
-
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+    regs->_eflags |= X86_EFLAGS_IF;
+    /* Nothing we allow to be emulated can change IOPL or TF. */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IOPL | X86_EFLAGS_TF)));
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+    if ( ctxt.bpmatch & DR_STEP )
+        regs->_eflags |= X86_EFLAGS_TF;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
             {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
-
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
-
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
-
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+                uint64_t val = rdtsc();
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
-
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
-
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
-
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        pv_cpuid(regs);
-        break;
-
-    default:
-        goto fail;
+        /* fall through */
+    case X86EMUL_RETRY:
+    case X86EMUL_EXCEPTION:
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3541,7 +3609,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,7 +857,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -1075,7 +1079,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc != X86EMUL_DONE ? rc : X86EMUL_OKAY;
 
     /* Ensure that the TSS is valid and has an io-bitmap-offset field. */
     if ( !tr.attr.fields.p ||
@@ -1610,6 +1614,17 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
@@ -2280,6 +2295,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
+    bool tf = ctxt->regs->eflags & EFLG_TF;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     enum x86_swint_type swint_type;
@@ -2731,14 +2747,10 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( ((nr_reps == 1) && (ops->write != x86emul_unhandleable_rw)) ||
+             !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2750,6 +2762,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2760,14 +2774,10 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( ((nr_reps == 1) && (ops->read != x86emul_unhandleable_rw)) ||
+             !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2781,6 +2791,8 @@ x86_emulate(
             _regs.esi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3038,6 +3050,7 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
         else if ( rc != X86EMUL_OKAY )
             goto done;
@@ -3846,7 +3859,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5198,11 +5215,6 @@ x86_emulate(
     }
 
  no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
@@ -5210,7 +5222,18 @@ x86_emulate(
     if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip    = _regs.eip;
+        ctxt->regs->eflags = _regs.eflags;
+        rc = X86EMUL_OKAY;
+    }
+
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( tf && (rc == X86EMUL_OKAY) && ops->inject_hw_exception )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -111,6 +111,13 @@ struct __packed segment_register {
 #define X86EMUL_RETRY          3
  /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */
 #define X86EMUL_CMPXCHG_FAILED 3
+ /*
+  * Operation fully done by one of the hooks:
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -531,6 +538,15 @@ x86emul_unhandleable_rw(
     void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
+/* Unhandleable cmpxchg */
+int
+x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt);
 
 #ifdef __XEN__
 

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 16/16] x86emul: don't assume a memory operand
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (14 preceding siblings ...)
  2016-09-28  8:18 ` [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-09-28  8:19 ` Jan Beulich
  2016-09-29 21:12   ` Andrew Cooper
  2016-09-28  8:42 ` [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
  16 siblings, 1 reply; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 3852 bytes --]

Especially for x86_insn_operand_ea() to return dependable segment
information even when the caller didn't consider applicability, we
shouldn't have ea.type start out as OP_MEM. Make it OP_NONE instead,
and set it to OP_MEM when we actually encounter memory-like operands.

This requires eliminating the XSA-123 fix, which has been unnecessary
since the elimination of the union in commit dd766684e7. That in turn
allows restricting the scope of override_seg to x86_decode(). On this
occasion also give it a proper type, instead of plain int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
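
Illustration only (not part of the patch): with ea.type now starting
out as OP_NONE, a caller inspecting the decoded state can rely on a
check like the one below. How that state actually gets exposed (e.g.
through x86_insn_operand_ea()) isn't shown here, so the field accesses
are schematic.

enum x86_segment seg;

if ( state->ea.type == OP_MEM )
    /* A memory operand was decoded; ea.mem.seg honours any override. */
    seg = state->ea.mem.seg;
else
    /* No memory operand at all. */
    seg = x86_seg_none;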

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1647,7 +1647,6 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
-    int override_seg;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -1683,7 +1682,6 @@ struct x86_emulate_state {
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
-#define override_seg (state->override_seg)
 #define ea (state->ea)
 
 static int
@@ -1712,6 +1710,7 @@ x86_decode_onebyte(
     case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
     case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         /* Source EA is not encoded via ModRM. */
+        ea.type = OP_MEM;
         ea.mem.off = insn_fetch_bytes(ad_bytes);
         break;
 
@@ -1802,11 +1801,11 @@ x86_decode(
 {
     uint8_t b, d, sib, sib_index, sib_base;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
+    enum x86_segment override_seg = x86_seg_none;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
-    override_seg = -1;
-    ea.type = OP_MEM;
+    ea.type = OP_NONE;
     ea.mem.seg = x86_seg_ds;
     ea.reg = PTR_POISON;
     state->regs = ctxt->regs;
@@ -2102,6 +2101,7 @@ x86_decode(
         else if ( ad_bytes == 2 )
         {
             /* 16-bit ModR/M decode. */
+            ea.type = OP_MEM;
             switch ( modrm_rm )
             {
             case 0:
@@ -2152,6 +2152,7 @@ x86_decode(
         else
         {
             /* 32/64-bit ModR/M decode. */
+            ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
                 sib = insn_fetch_type(uint8_t);
@@ -2216,7 +2217,7 @@ x86_decode(
         }
     }
 
-    if ( override_seg != -1 && ea.type == OP_MEM )
+    if ( override_seg != x86_seg_none )
         ea.mem.seg = override_seg;
 
     /* Fetch the immediate operand, if present. */
@@ -4253,13 +4254,11 @@ x86_emulate(
             generate_exception_if(limit < sizeof(long) ||
                                   (limit & (limit - 1)), EXC_UD, -1);
             base &= ~(limit - 1);
-            if ( override_seg == -1 )
-                override_seg = x86_seg_ds;
             if ( ops->rep_stos )
             {
                 unsigned long nr_reps = limit / sizeof(zero);
 
-                rc = ops->rep_stos(&zero, override_seg, base, sizeof(zero),
+                rc = ops->rep_stos(&zero, ea.mem.seg, base, sizeof(zero),
                                    &nr_reps, ctxt);
                 if ( rc == X86EMUL_OKAY )
                 {
@@ -4271,7 +4270,7 @@ x86_emulate(
             }
             while ( limit )
             {
-                rc = ops->write(override_seg, base, &zero, sizeof(zero), ctxt);
+                rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 base += sizeof(zero);
@@ -5257,7 +5256,6 @@ x86_emulate(
 #undef rex_prefix
 #undef lock_prefix
 #undef vex
-#undef override_seg
 #undef ea
 
 #ifdef __XEN__




[-- Attachment #2: x86emul-ea-init-OP_NONE.patch --]
[-- Type: text/plain, Size: 3888 bytes --]

x86emul: don't assume a memory operand

Especially for x86_insn_operand_ea() to return dependable segment
information even when the caller didn't consider applicability, we
shouldn't have ea.type start out as OP_MEM. Make it OP_NONE instead,
and set it to OP_MEM when we actually encounter memory-like operands.

This requires eliminating the XSA-123 fix, which has been unnecessary
since the elimination of the union in commit dd766684e7. That in turn
allows restricting the scope of override_seg to x86_decode(). On this
occasion also give it a proper type, instead of plain int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1647,7 +1647,6 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
-    int override_seg;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -1683,7 +1682,6 @@ struct x86_emulate_state {
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
-#define override_seg (state->override_seg)
 #define ea (state->ea)
 
 static int
@@ -1712,6 +1710,7 @@ x86_decode_onebyte(
     case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
     case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         /* Source EA is not encoded via ModRM. */
+        ea.type = OP_MEM;
         ea.mem.off = insn_fetch_bytes(ad_bytes);
         break;
 
@@ -1802,11 +1801,11 @@ x86_decode(
 {
     uint8_t b, d, sib, sib_index, sib_base;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
+    enum x86_segment override_seg = x86_seg_none;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
-    override_seg = -1;
-    ea.type = OP_MEM;
+    ea.type = OP_NONE;
     ea.mem.seg = x86_seg_ds;
     ea.reg = PTR_POISON;
     state->regs = ctxt->regs;
@@ -2102,6 +2101,7 @@ x86_decode(
         else if ( ad_bytes == 2 )
         {
             /* 16-bit ModR/M decode. */
+            ea.type = OP_MEM;
             switch ( modrm_rm )
             {
             case 0:
@@ -2152,6 +2152,7 @@ x86_decode(
         else
         {
             /* 32/64-bit ModR/M decode. */
+            ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
                 sib = insn_fetch_type(uint8_t);
@@ -2216,7 +2217,7 @@ x86_decode(
         }
     }
 
-    if ( override_seg != -1 && ea.type == OP_MEM )
+    if ( override_seg != x86_seg_none )
         ea.mem.seg = override_seg;
 
     /* Fetch the immediate operand, if present. */
@@ -4253,13 +4254,11 @@ x86_emulate(
             generate_exception_if(limit < sizeof(long) ||
                                   (limit & (limit - 1)), EXC_UD, -1);
             base &= ~(limit - 1);
-            if ( override_seg == -1 )
-                override_seg = x86_seg_ds;
             if ( ops->rep_stos )
             {
                 unsigned long nr_reps = limit / sizeof(zero);
 
-                rc = ops->rep_stos(&zero, override_seg, base, sizeof(zero),
+                rc = ops->rep_stos(&zero, ea.mem.seg, base, sizeof(zero),
                                    &nr_reps, ctxt);
                 if ( rc == X86EMUL_OKAY )
                 {
@@ -4271,7 +4270,7 @@ x86_emulate(
             }
             while ( limit )
             {
-                rc = ops->write(override_seg, base, &zero, sizeof(zero), ctxt);
+                rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 base += sizeof(zero);
@@ -5257,7 +5256,6 @@ x86_emulate(
 #undef rex_prefix
 #undef lock_prefix
 #undef vex
-#undef override_seg
 #undef ea
 
 #ifdef __XEN__


* [PATCH v2 00/16] x86: split insn emulator decode and execution
  2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
                   ` (15 preceding siblings ...)
  2016-09-28  8:19 ` [PATCH v2 16/16] x86emul: don't assume a memory operand Jan Beulich
@ 2016-09-28  8:42 ` Jan Beulich
  16 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-28  8:42 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

>>> On 28.09.16 at 09:59, <JBeulich@suse.com> wrote:
> 01: x86emul: split instruction decoding from execution
> 02: x86emul: fetch all insn bytes during the decode phase
> 03: x86emul: track only rIP in emulator state
> 04: x86emul: complete decoding of two-byte instructions
> 05: x86emul: add XOP decoding
> 06: x86emul: add EVEX decoding
> 07: x86emul: generate and make use of a canonical opcode representation
> 08: SVM: use generic instruction decoding
> 09: x86/32on64: use generic instruction decoding
> 10: x86/PV: split out dealing with CRn from privileged instruction handling
> 11: x86/PV: split out dealing with DRn from privileged instruction handling
> 12: x86/PV: split out dealing with MSRs from privileged instruction handling
> 13: x86emul: support XSETBV
> 14: x86emul: sort opcode 0f01 special case switch() statement
> 15: x86/PV: use generic emulator for privileged instruction handling
> 16: x86emul: don't assume a memory operand

Several of the patches don't really depend on (all) earlier ones. In
particular I'd really appreciate it if patch 12 could go in sooner rather
than later, as it otherwise is unduly cumbersome for me to make further
changes in this area (as was e.g. required for
https://lists.xenproject.org/archives/html/xen-devel/2016-09/msg01669.html,
and as is not all that unlikely to become necessary again).

I'll also refrain from posting a v2 of the follow-up series until that one
has actually been looked at (or until a meaningful part of the series here
has gone in).

Thanks, Jan



* Re: [PATCH v2 01/16] x86emul: split instruction decoding from execution
  2016-09-28  8:06 ` [PATCH v2 01/16] x86emul: split instruction decoding from execution Jan Beulich
@ 2016-09-28 16:24   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-28 16:24 UTC (permalink / raw)
  To: Jan Beulich, xen-devel



On 28/09/16 09:06, Jan Beulich wrote:
> This is only the mechanical part, a subsequent patch will make non-
> mechanical adjustments to actually do all decoding in this new
> function.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase
  2016-09-28  8:07 ` [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase Jan Beulich
@ 2016-09-28 16:37   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-28 16:37 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:07, Jan Beulich wrote:
> This way we can offer to callers the service of just sizing
> instructions, and we also can better guarantee not to raise the wrong
> fault due to not having read all relevant bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 03/16] x86emul: track only rIP in emulator state
  2016-09-28  8:08 ` [PATCH v2 03/16] x86emul: track only rIP in emulator state Jan Beulich
@ 2016-09-28 16:41   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-28 16:41 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:08, Jan Beulich wrote:
> Now that all decoding happens in x86_decode() there's no need to keep
> the local registers copy in struct x86_emulate_state. Only rIP gets
> updated in the decode phase, so only that register needs tracking
> there. All other (read-only) registers can be read from the original
> structure (but sadly, due to it getting passed to decode_register(),
> the pointer can't be made to point to "const" to make the compiler help
> ensure no modification happens).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions
  2016-09-28  8:08 ` [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions Jan Beulich
@ 2016-09-28 17:22   ` Andrew Cooper
  2016-09-29  6:37     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-28 17:22 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:08, Jan Beulich wrote:
> @@ -1651,6 +1668,34 @@ x86_decode_onebyte(
>  }
>  
>  static int
> +x86_decode_twobyte(
> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt,
> +    const struct x86_emulate_ops *ops)
> +{
> +    int rc = X86EMUL_OKAY;
> +
> +    switch ( state->opcode )
> +    {
> +    case 0x78:

It occurs to me (after spending far too long attempting to locate these
instructions in the manual), that we should consider where the cpuid
checks should go.

This opcode is particularly awkward.  Without a prefix, it is Intel's
VMREAD instruction, but with 66 or f2, it becomes part of AMD's SSE4A. 
(I guess this answers why the VT-x instructions strictly may not be
encoded with legacy prefixes.)

On real hardware, the cpuid check would logically be here, as it
indicates an inability of the pipeline to understand an instruction. 
OTOH, there is an argument to be made about sizing instructions which
are understood, but shouldn't be successfully emulated.

I don't think any of this is a blocker to the patch, so Reviewed-by:
Andrew Cooper <andrew.cooper3@citrix.com>, but we certainly should
consider it.


* Re: [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions
  2016-09-28 17:22   ` Andrew Cooper
@ 2016-09-29  6:37     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-29  6:37 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 28.09.16 at 19:22, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:08, Jan Beulich wrote:
>> @@ -1651,6 +1668,34 @@ x86_decode_onebyte(
>>  }
>>  
>>  static int
>> +x86_decode_twobyte(
>> +    struct x86_emulate_state *state,
>> +    struct x86_emulate_ctxt *ctxt,
>> +    const struct x86_emulate_ops *ops)
>> +{
>> +    int rc = X86EMUL_OKAY;
>> +
>> +    switch ( state->opcode )
>> +    {
>> +    case 0x78:
> 
> It occurs to me (after spending far too long attempting to locate these
> instructions in the manual), that we should consider where the cpuid
> checks should go.
> 
> This opcode is particularly awkward.  Without a prefix, it is Intel's
> VMREAD instruction, but with 66 or f2, it becomes part of AMD's SSE4A. 
> (I guess this answers why the VT-x instructions strictly may not be
> encoded with legacy prefixes.)
> 
> On real hardware, the cpuid check would logically be here, as it
> indicates an inability of the pipeline to understand an instruction. 
> OTOH, there is an argument to be made about sizing instructions which
> are understood, but shouldn't be successfully emulated.

True - if only there weren't those exceptional opcodes which are
known to #UD yet still decode e.g. a ModRM byte before they
actually do so.

Furthermore, the decoding would get more complicated: We'd have
to check both CPUID bits here, while having them in the execution
phase allows them to sit cleanly in different case statements. Plus,
if we were to not fetch further bytes upon recognizing an insn that
is going to #UD, putting the check here would even be too late
(except for the exceptions mentioned above): We'd have to put the
check before fetching the ModRM or immediate operand bytes.
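
I.e. with the canonical opcode representation from patch 07 the two
checks would naturally end up in distinct case blocks, roughly like
this (just a structural sketch - the actual feature checks are elided,
and the prefix-qualified macro names are only meant as illustration):

    case X86EMUL_OPC(0x0f, 0x78):        /* vmread */
        /* The VMX-side check would sit here ... */
        break;

    case X86EMUL_OPC_66(0x0f, 0x78):     /* extrq */
    case X86EMUL_OPC_F2(0x0f, 0x78):     /* insertq */
        /* ... while the SSE4A one would sit here, on its own. */
        break;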

Jan



* Re: [PATCH v2 05/16] x86emul: add XOP decoding
  2016-09-28  8:09 ` [PATCH v2 05/16] x86emul: add XOP decoding Jan Beulich
@ 2016-09-29  9:07   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29  9:07 UTC (permalink / raw)
  To: Jan Beulich, xen-devel



On 28/09/16 09:09, Jan Beulich wrote:
> This way we can at least size (and e.g. skip) them if needed, and we
> also won't raise the wrong fault due to not having read all relevant
> bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>.  I think... The
prefix decoding is getting very complicated, but I can't suggest an
easier way.

~Andrew


* Re: [PATCH v2 06/16] x86emul: add EVEX decoding
  2016-09-28  8:10 ` [PATCH v2 06/16] x86emul: add EVEX decoding Jan Beulich
@ 2016-09-29  9:08   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29  9:08 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:10, Jan Beulich wrote:
> This way we can at least size (and e.g. skip) them if needed, and we
> also won't raise the wrong fault due to not having read all relevant
> bytes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation
  2016-09-28  8:12 ` [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation Jan Beulich
@ 2016-09-29 10:11   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 10:11 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:12, Jan Beulich wrote:
> @@ -1732,13 +1745,35 @@ x86_decode_twobyte(
>  }
>  
>  static int
> +x86_decode_0f38(
> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt,
> +    const struct x86_emulate_ops *ops)
> +{
> +    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
> +    {
> +    case 0x00 ... 0xef:
> +    case 0xf2 ... 0xff:
> +        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
> +        break;
> +
> +    case 0xf0: case 0xf1: /* movbe / crc32 */
> +        if ( rep_prefix() )
> +            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
> +        break;
> +    }
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static int
>  x86_decode(
>      struct x86_emulate_state *state,
>      struct x86_emulate_ctxt *ctxt,
>      const struct x86_emulate_ops  *ops)
>  {
>      uint8_t b, d, sib, sib_index, sib_base;
> -    unsigned int def_op_bytes, def_ad_bytes;
> +    unsigned int def_op_bytes, def_ad_bytes, opcode;
>      int rc = X86EMUL_OKAY;
>  
>      memset(state, 0, sizeof(*state));
> @@ -1819,29 +1854,31 @@ x86_decode(
>  
>      /* Opcode byte(s). */
>      d = opcode_table[b];
> -    if ( d == 0 )
> +    if ( d == 0 && b == 0x0f)

Spaces.

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 08/16] SVM: use generic instruction decoding
  2016-09-28  8:13 ` [PATCH v2 08/16] SVM: use generic instruction decoding Jan Beulich
@ 2016-09-29 19:24   ` Andrew Cooper
  2016-09-29 19:30     ` Andrew Cooper
  2016-09-30  8:32     ` Jan Beulich
  0 siblings, 2 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 19:24 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Boris Ostrovsky, Suravee Suthikulpanit

On 28/09/16 09:13, Jan Beulich wrote:
> ... instead of custom handling. To facilitate this break out init code
> from _hvm_emulate_one() into the new hvm_emulate_init(), and make
> hvmemul_insn_fetch( globally available.

)

>  int __get_instruction_length_from_list(struct vcpu *v,
>          const enum instruction_index *list, unsigned int list_count)
>  {
>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> -    unsigned int i, j, inst_len = 0;
> -    enum instruction_index instr = 0;
> -    u8 buf[MAX_INST_LEN];
> -    const u8 *opcode = NULL;
> -    unsigned long fetch_addr, fetch_limit;
> -    unsigned int fetch_len, max_len;
> +    struct hvm_emulate_ctxt ctxt;
> +    struct x86_emulate_state *state;
> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
> +    int modrm_mod;
>  

Despite knowing how this works, it is still confusing to read.  Do you
mind putting in a comment such as:

/* In a debug build, always use x86_decode_insn() and compare with
hardware. */

> +#ifdef NDEBUG
>      if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
>          return inst_len;
>  
>      if ( vmcb->exitcode == VMEXIT_IOIO )
>          return vmcb->exitinfo2 - vmcb->rip;
> +#endif
>  
> -    /* Fetch up to the next page break; we'll fetch from the next page
> -     * later if we have to. */
> -    fetch_addr = svm_rip2pointer(v, &fetch_limit);
> -    if ( vmcb->rip > fetch_limit )
> -        return 0;
> -    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
> -    fetch_len = min_t(unsigned int, max_len,
> -                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
> -    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
> +    ASSERT(v == current);
> +    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
> +    hvm_emulate_init(&ctxt, NULL, 0);
> +    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
> +    if ( IS_ERR_OR_NULL(state) )
>          return 0;
>  
> -    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
> -    {
> -        inst_len++;
> -        if ( inst_len >= fetch_len )
> -        {
> -            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
> -                        max_len - fetch_len) )
> -                return 0;
> -            fetch_len = max_len;
> -        }
> +    inst_len = x86_insn_length(state, &ctxt.ctxt);
> +    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
> +    x86_emulate_free_state(state);
> +#ifndef NDEBUG
> +    if ( vmcb->exitcode == VMEXIT_IOIO )
> +        j = vmcb->exitinfo2 - vmcb->rip;
> +    else
> +        j = svm_nextrip_insn_length(v);
> +    if ( j && j != inst_len )
> +    {
> +        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
> +                ctxt.ctxt.opcode, inst_len, j);
> +        return j;
>      }
> +#endif
>  
>      for ( j = 0; j < list_count; j++ )
>      {
> -        instr = list[j];
> -        opcode = opc_bytes[instr];
> +        enum instruction_index instr = list[j];
>  
> -        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
> +        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));

This is another ASSERT() used as a bounds check, and will suffer a build
failure on clang.

You need to use s/enum instruction_index/unsigned int/ to fix the build
issue.  Can I also request the use of

if ( instr >= ARRAY_SIZE(opc_tab) )
{
    ASSERT_UNREACHABLE();
    return 0;
}

instead?

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -5200,3 +5214,89 @@ x86_emulate(
>  #undef vex
>  #undef override_seg
>  #undef ea
> +
> +#ifdef __XEN__
> +
> +#include <xen/err.h>
> +
> +struct x86_emulate_state *
> +x86_decode_insn(
> +    struct x86_emulate_ctxt *ctxt,
> +    int (*insn_fetch)(
> +        enum x86_segment seg, unsigned long offset,
> +        void *p_data, unsigned int bytes,
> +        struct x86_emulate_ctxt *ctxt))
> +{
> +    static DEFINE_PER_CPU(struct x86_emulate_state, state);
> +    struct x86_emulate_state *state = &this_cpu(state);
> +    const struct x86_emulate_ops ops = {

This can be static, to avoid having it reconstructed on the stack each
function call.

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>



* Re: [PATCH v2 08/16] SVM: use generic instruction decoding
  2016-09-29 19:24   ` Andrew Cooper
@ 2016-09-29 19:30     ` Andrew Cooper
  2016-09-30  8:32     ` Jan Beulich
  1 sibling, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 19:30 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Boris Ostrovsky, Suravee Suthikulpanit

On 29/09/16 20:24, Andrew Cooper wrote:
>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -5200,3 +5214,89 @@ x86_emulate(
>>  #undef vex
>>  #undef override_seg
>>  #undef ea
>> +
>> +#ifdef __XEN__
>> +
>> +#include <xen/err.h>
>> +
>> +struct x86_emulate_state *
>> +x86_decode_insn(
>> +    struct x86_emulate_ctxt *ctxt,
>> +    int (*insn_fetch)(
>> +        enum x86_segment seg, unsigned long offset,
>> +        void *p_data, unsigned int bytes,
>> +        struct x86_emulate_ctxt *ctxt))
>> +{
>> +    static DEFINE_PER_CPU(struct x86_emulate_state, state);
>> +    struct x86_emulate_state *state = &this_cpu(state);
>> +    const struct x86_emulate_ops ops = {
> This can be static, to avoid having it reconstructed on the stack each
> function call.

On further consideration, it is clear that this can't be static.  Sorry
for the noise.


* Re: [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation
  2016-09-28  8:13 ` [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-09-29 19:47   ` Andrew Cooper
  2016-09-30  7:30     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 19:47 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:13, Jan Beulich wrote:
> @@ -3204,179 +3285,59 @@ static void emulate_gate_op(struct cpu_u
>          return;
>      }
>  
> -    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
> -    ad_default = ad_bytes = op_default;
> -    opnd_sel = opnd_off = 0;
> -    jump = -1;
> -    for ( eip = regs->eip; eip - regs->_eip < 10; )
> +    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
> +    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */

Are you sure this is safe?  What if the instruction is substituted under
our feet?

Currently, the only issues I can spot would be a load of "& 0" in
truncate_word() and friends, but my gut feeling is that this is not a
safe or sensible thing to rely on.

Everything else looks fine though.

~Andrew


* Re: [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling
  2016-09-28  8:14 ` [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
@ 2016-09-29 20:01   ` Andrew Cooper
  2016-09-30  7:12     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 20:01 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:14, Jan Beulich wrote:
> This is in preparation for using the generic emulator here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2255,6 +2255,107 @@ unsigned long guest_to_host_gpr_switch(u
>  
>  void (*pv_post_outb_hook)(unsigned int port, u8 value);
>  
> +static int priv_op_read_cr(unsigned int reg, unsigned long *val,
> +                           struct x86_emulate_ctxt *ctxt)
> +{
> +    const struct vcpu *curr = current;
> +
> +    switch ( reg )
> +    {
> +    case 0: /* Read CR0 */
> +        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
> +        return X86EMUL_OKAY;
> +
> +    case 2: /* Read CR2 */
> +    case 4: /* Read CR4 */
> +        *val = curr->arch.pv_vcpu.ctrlreg[reg];
> +        return X86EMUL_OKAY;
> +
> +    case 3: /* Read CR3 */
> +    {
> +        const struct domain *currd = curr->domain;
> +        unsigned long mfn;

Any chance of switching this to mfn_t while you are moving it?

> +
> +        if ( !is_pv_32bit_domain(currd) )
> +        {
> +            mfn = pagetable_get_pfn(curr->arch.guest_table);
> +            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
> +        }
> +        else
> +        {
> +            l4_pgentry_t *pl4e =
> +                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
> +
> +            mfn = l4e_get_pfn(*pl4e);
> +            unmap_domain_page(pl4e);
> +            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
> +        }
> +        /* PTs should not be shared */
> +        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
> +        return X86EMUL_OKAY;
> +    }
> +    }
> +
> +    return X86EMUL_UNHANDLEABLE;
> +}
> +
> +static int priv_op_write_cr(unsigned int reg, unsigned long val,
> +                            struct x86_emulate_ctxt *ctxt)
> +{
> +    struct vcpu *curr = current;
> +
> +    switch ( reg )
> +    {
> +    case 0: /* Write CR0 */
> +        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
> +        {
> +            gdprintk(XENLOG_WARNING,
> +                    "Attempt to change unmodifiable CR0 flags\n");
> +            break;
> +        }
> +        do_fpu_taskswitch(!!(val & X86_CR0_TS));
> +        return X86EMUL_OKAY;
> +
> +    case 2: /* Write CR2 */
> +        curr->arch.pv_vcpu.ctrlreg[2] = val;
> +        arch_set_cr2(curr, val);
> +        return X86EMUL_OKAY;
> +
> +    case 3: /* Write CR3 */
> +    {
> +        struct domain *currd = curr->domain;
> +        unsigned long gfn;

Similarly, gfn_t ?

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 11/16] x86/PV: split out dealing with DRn from privileged instruction handling
  2016-09-28  8:15 ` [PATCH v2 11/16] x86/PV: split out dealing with DRn " Jan Beulich
@ 2016-09-29 20:13   ` Andrew Cooper
  2016-09-30  7:16     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 20:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:15, Jan Beulich wrote:
> This is in preparation for using the generic emulator here.
>
> Some care is needed temporarily to not unduly alter guest register
> state: The local variable "res" can only go away once this code got
> fully switched over to using x86_emulate().
>
> Also switch to IS_ERR_VALUE() instead of (incorrectly) open coding it.

It isn't actually an ERR_PTR().  That bit of code pre-dates the
introduction of ERR_PTR() by some margin.

The return code of do_get_debugreg() is broken and needs fixing, along
with the ABI of the debugreg hypercalls.

This change does cause an ABI change for PV guests, as they now can't
read a debug register whose value is in the top 4k of linear address
space (rather than the top 8th of a page previously), but given that the
ABI is already known broken, I am not sure I care too much.

Either way, keeping it like this, or switching back to the previous
opencoding, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 12/16] x86/PV: split out dealing with MSRs from privileged instruction handling
  2016-09-28  8:16 ` [PATCH v2 12/16] x86/PV: split out dealing with MSRs " Jan Beulich
@ 2016-09-29 20:44   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 20:44 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:16, Jan Beulich wrote:
> This is in preparation for using the generic emulator here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

This looks like only code motion, so Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

There is some rather unhelpful behaviour with the cases where we
silently discard access to MSRs such as MSR_FAM10H_MMIO_CONF_BASE, but
that is definitely not something to fix now.


* Re: [PATCH v2 13/16] x86emul: support XSETBV
  2016-09-28  8:17 ` [PATCH v2 13/16] x86emul: support XSETBV Jan Beulich
@ 2016-09-29 20:45   ` Andrew Cooper
  2016-09-30  8:05     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 20:45 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:17, Jan Beulich wrote:
> This is a prereq for switching PV privileged op emulation to the
> generic instruction emulator. Since handle_xsetbv() is already capable
> of dealing with all guest kinds, avoid introducing another hook here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement
  2016-09-28  8:18 ` [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
@ 2016-09-29 20:46   ` Andrew Cooper
  0 siblings, 0 replies; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 20:46 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:18, Jan Beulich wrote:
> Sort the special case opcode 0f01 entries numerically, insert blank
> lines between each of the cases, and properly place opening braces.
>
> No functional change.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling
  2016-09-28  8:18 ` [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-09-29 21:06   ` Andrew Cooper
  2016-09-30  8:55     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 21:06 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:18, Jan Beulich wrote:
> There's a new emulator return code being added to allow bypassing
> certain operations (see the code comment). Its handling in the epilogue
> code involves moving the raising of the single step trap until after
> registers were updated. This should probably have been that way from
> the beginning, to allow the inject_hw_exception() hook to see updated
> register state (in case it cares) - it's a trap, after all.

I agree.  (However, given the complexity of this patch, it really would
be better to split changes like the #DB handling out into a separate patch).

>
> The other small tweak to the emulator is to single iteration handling
> of INS and OUTS: Since we don't want to handle any other memory access
> instructions, we want these to be handled by the rep_ins() / rep_outs()
> hooks here too. The read() / write() hook pointers get checked for that
> purpose.

Moving the non-rep INS/OUTS instructions into rep_ins/outs() (perhaps
with dropping the rep_ prefix from the callback names) seems sensible.

However, making this implicit on a check against the read/write hooks
doesn't seem sensible.  Anyone looking at the code is going to get
thoroughly confused.

Can't we make the ins/outs hook deal properly with a rep of 1, and have
x86_emulate() know not to update %ecx in this case?

>
> And finally handling of exceptions gets changed for REP INS / REP OUTS:
> If the hook returns X86EMUL_EXCEPTION, register state will still get
> updated if some iterations have been performed (but the rIP update will
> get suppressed if not all of them did get handled).

Isn't this what happens on real hardware anyway?

>  While on the HVM side
> the VA -> LA -> PA translation process clips the number of repetitions,
> doing so would unduly complicate the PV side code being added here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> One thing to be considered is that despite avoiding the handling of
> memory reads and writes (other than for INS and OUTS) the set of insns
> now getting potentially handled by the emulator is much larger than
> before. A possible solution to this would be a new hook to be called
> between decode and execution stages, allowing further restrictions to
> be enforced. Of course this could easily be a follow-up patch, as the
> one here is quite big already.

I think this would be a very sensible precaution.  I would suggest even
that this patch doesn't get committed without being adjacent to such a
patch.

>
> Another thing to consider is to extend the X86EMUL_EXCEPTION
> handling change mentioned above to other string instructions. In that
> case this should probably be broken out into a prereq patch.

Yes.

~Andrew


* Re: [PATCH v2 16/16] x86emul: don't assume a memory operand
  2016-09-28  8:19 ` [PATCH v2 16/16] x86emul: don't assume a memory operand Jan Beulich
@ 2016-09-29 21:12   ` Andrew Cooper
  2016-09-30  8:25     ` Jan Beulich
  0 siblings, 1 reply; 43+ messages in thread
From: Andrew Cooper @ 2016-09-29 21:12 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 28/09/16 09:19, Jan Beulich wrote:
> Especially for x86_insn_operand_ea() to return dependable segment
> information even when the caller didn't consider applicability, we
> shouldn't have ea.type start out as OP_MEM. Make it OP_NONE instead,
> and set it to OP_MEM when we actually encounter memory-like operands.
>
> This requires eliminating the XSA-123 fix, which has been unnecessary
> since the elimination of the union in commit dd766684e7. That in turn
> allows restricting the scope of override_seg to x86_decode(). While at
> it, also give it a proper type instead of plain int.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1647,7 +1647,6 @@ struct x86_emulate_state {
>      opcode_desc_t desc;
>      union vex vex;
>      union evex evex;
> -    int override_seg;
>  
>      /*
>       * Data operand effective address (usually computed from ModRM).
> @@ -1683,7 +1682,6 @@ struct x86_emulate_state {
>  #define lock_prefix (state->lock_prefix)
>  #define vex (state->vex)
>  #define evex (state->evex)
> -#define override_seg (state->override_seg)
>  #define ea (state->ea)
>  
>  static int
> @@ -1712,6 +1710,7 @@ x86_decode_onebyte(
>      case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
>      case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
>          /* Source EA is not encoded via ModRM. */
> +        ea.type = OP_MEM;
>          ea.mem.off = insn_fetch_bytes(ad_bytes);
>          break;
>  
> @@ -1802,11 +1801,11 @@ x86_decode(
>  {
>      uint8_t b, d, sib, sib_index, sib_base;
>      unsigned int def_op_bytes, def_ad_bytes, opcode;
> +    enum x86_segment override_seg = x86_seg_none;
>      int rc = X86EMUL_OKAY;
>  
>      memset(state, 0, sizeof(*state));
> -    override_seg = -1;
> -    ea.type = OP_MEM;
> +    ea.type = OP_NONE;
>      ea.mem.seg = x86_seg_ds;
>      ea.reg = PTR_POISON;
>      state->regs = ctxt->regs;
> @@ -2102,6 +2101,7 @@ x86_decode(
>          else if ( ad_bytes == 2 )
>          {
>              /* 16-bit ModR/M decode. */
> +            ea.type = OP_MEM;
>              switch ( modrm_rm )
>              {
>              case 0:
> @@ -2152,6 +2152,7 @@ x86_decode(
>          else
>          {
>              /* 32/64-bit ModR/M decode. */
> +            ea.type = OP_MEM;
>              if ( modrm_rm == 4 )
>              {
>                  sib = insn_fetch_type(uint8_t);
> @@ -2216,7 +2217,7 @@ x86_decode(
>          }
>      }
>  
> -    if ( override_seg != -1 && ea.type == OP_MEM )
> +    if ( override_seg != x86_seg_none )

I don't see why the "ea.type == OP_MEM" should be dropped at this
point.  We have already set ea.type appropriately for memory
instructions by this point, and it does open up the case where
instructions which would have triggered XSA-123 get incorrect
information reported if queried with x86_insn_operand_ea()

~Andrew

>          ea.mem.seg = override_seg;
>  
>      /* Fetch the immediate operand, if present. */
> @@ -4253,13 +4254,11 @@ x86_emulate(
>              generate_exception_if(limit < sizeof(long) ||
>                                    (limit & (limit - 1)), EXC_UD, -1);
>              base &= ~(limit - 1);
> -            if ( override_seg == -1 )
> -                override_seg = x86_seg_ds;
>              if ( ops->rep_stos )
>              {
>                  unsigned long nr_reps = limit / sizeof(zero);
>  
> -                rc = ops->rep_stos(&zero, override_seg, base, sizeof(zero),
> +                rc = ops->rep_stos(&zero, ea.mem.seg, base, sizeof(zero),
>                                     &nr_reps, ctxt);
>                  if ( rc == X86EMUL_OKAY )
>                  {
> @@ -4271,7 +4270,7 @@ x86_emulate(
>              }
>              while ( limit )
>              {
> -                rc = ops->write(override_seg, base, &zero, sizeof(zero), ctxt);
> +                rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
>                  if ( rc != X86EMUL_OKAY )
>                      goto done;
>                  base += sizeof(zero);
> @@ -5257,7 +5256,6 @@ x86_emulate(
>  #undef rex_prefix
>  #undef lock_prefix
>  #undef vex
> -#undef override_seg
>  #undef ea
>  
>  #ifdef __XEN__
>
>
>



* Re: [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling
  2016-09-29 20:01   ` Andrew Cooper
@ 2016-09-30  7:12     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  7:12 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 22:01, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:14, Jan Beulich wrote:
>> +static int priv_op_read_cr(unsigned int reg, unsigned long *val,
>> +                           struct x86_emulate_ctxt *ctxt)
>> +{
>> +    const struct vcpu *curr = current;
>> +
>> +    switch ( reg )
>> +    {
>> +    case 0: /* Read CR0 */
>> +        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
>> +        return X86EMUL_OKAY;
>> +
>> +    case 2: /* Read CR2 */
>> +    case 4: /* Read CR4 */
>> +        *val = curr->arch.pv_vcpu.ctrlreg[reg];
>> +        return X86EMUL_OKAY;
>> +
>> +    case 3: /* Read CR3 */
>> +    {
>> +        const struct domain *currd = curr->domain;
>> +        unsigned long mfn;
> 
> Any chance of switching this to mfn_t while you are moving it?

To be honest, I'd rather not - there's no single place where the
typed variant would already be needed, so all variable references
would become cluttered.
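
E.g. the 64-bit CR3 read would then end up as something like (just a
sketch):

    mfn_t mfn = _mfn(pagetable_get_pfn(curr->arch.guest_table));

    *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
    BUG_ON(page_get_owner(mfn_to_page(mfn_x(mfn))) == dom_cow);

i.e. every consumer needs an mfn_x() unwrap, without any of them
gaining type safety from the change.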

Jan



* Re: [PATCH v2 11/16] x86/PV: split out dealing with DRn from privileged instruction handling
  2016-09-29 20:13   ` Andrew Cooper
@ 2016-09-30  7:16     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  7:16 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 22:13, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:15, Jan Beulich wrote:
>> This is in preparation for using the generic emulator here.
>>
>> Some care is needed temporarily to not unduly alter guest register
>> state: The local variable "res" can only go away once this code got
>> fully switched over to using x86_emulate().
>>
>> Also switch to IS_ERR_VALUE() instead of (incorrectly) open coding it.
> 
> It isn't actually an ERR_PTR().  That bit of code pre-dates the
> introduction of ERR_PTR() by some margin.
> 
> The return code of do_get_debugreg() is broken and needs fixing, along
> with the ABI of the debugreg hypercalls.
> 
> This change does cause an ABI change for PV guests, as they now can't
> read a debug register whose value is in the top 4k of linear address
> space, (ather than the top 8th of a page previously), but given that the
> ABI is already known broken, I am not sure I care too much.

Well, the way ERR_PTR() works, we imply the valid errno range to
be -1...-4095. I.e. assuming a smaller range here is a latent bug
(becoming an actual one if any such value ever got bubbled up
through that path). So I prefer to keep the patch the way it is,
despite its ABI effect - PV guests not being able to fully use
breakpoints on the last 8th of a page already implies that no
breakpoint should be put in the last page anyway (and quite likely
no PV guest OS has that page mapped in the first place).
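
For reference, IS_ERR_VALUE() boils down to something like (as in
err.h, give or take the exact spelling):

    #define MAX_ERRNO       4095
    #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)

i.e. the whole -1 ... -4095 range is reserved as error indication,
which is where the 4k figure above comes from.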

Jan



* Re: [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation
  2016-09-29 19:47   ` Andrew Cooper
@ 2016-09-30  7:30     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  7:30 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 21:47, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:13, Jan Beulich wrote:
>> @@ -3204,179 +3285,59 @@ static void emulate_gate_op(struct cpu_u
>>          return;
>>      }
>>  
>> -    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
>> -    ad_default = ad_bytes = op_default;
>> -    opnd_sel = opnd_off = 0;
>> -    jump = -1;
>> -    for ( eip = regs->eip; eip - regs->_eip < 10; )
>> +    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
>> +    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
> 
> Are you sure this is safe?  What if the instruction is substituted under
> our feet?
> 
> Currently, the only issues I can spot would be a load of "& 0" in
> truncate_word() and friends, but my gut feeling is that this is not a
> safe or sensible thing to rely on.

This is safe because (a) x86_decode_insn() won't reach any code
using sp_size and (b) as extra care it installs x86emul_unhandleable_rw()
as the .read handler (and poisons the .write and .cmpxchg ones).
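
I.e. the hook set x86_decode_insn() uses internally is along the lines
of (sketch - the exact poison values don't matter for the argument):

    const struct x86_emulate_ops ops = {
        .insn_fetch = insn_fetch,              /* caller supplied */
        .read       = x86emul_unhandleable_rw,
        /* .write / .cmpxchg poisoned, never legitimately reached */
    };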

Jan



* Re: [PATCH v2 13/16] x86emul: support XSETBV
  2016-09-29 20:45   ` Andrew Cooper
@ 2016-09-30  8:05     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  8:05 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 22:45, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:17, Jan Beulich wrote:
>> This is a prereq for switching PV privileged op emulation to the
>> generic instruction emulator. Since handle_xsetbv() is already capable
>> of dealing with all guest kinds, avoid introducing another hook here.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, but this needs a v3 - I didn't notice until my pre-commit
build check that it needs several hunks from "x86emul: conditionally
clear BNDn for branches" moved here (in order to facilitate
including asm/xstate.h in xen/arch/x86/x86_emulate.c).

Jan



* Re: [PATCH v2 16/16] x86emul: don't assume a memory operand
  2016-09-29 21:12   ` Andrew Cooper
@ 2016-09-30  8:25     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  8:25 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 23:12, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:19, Jan Beulich wrote:
>> @@ -2216,7 +2217,7 @@ x86_decode(
>>          }
>>      }
>>  
>> -    if ( override_seg != -1 && ea.type == OP_MEM )
>> +    if ( override_seg != x86_seg_none )
> 
> I don't see why the "ea.type == OP_MEM" should be dropped at this
> point.  We have already set ea.type appropriately for memory
> instructions by this point, and it does open up the case where
> instructions which would have triggered XSA-123 get incorrect
> information reported if queried with x86_insn_operand_ea()

The need to remove this actually became apparent with the
testing I did for the priv-op handling, namely for OUTS with a
segment override: When we had (before the patch here)
ea.type start out as OP_MEM, the conditional above was true
_unless_ ea.type got changed later on. With it now (properly
imo) starting out as OP_NONE, instructions not changing it to
OP_MEM (like all the string ones) would not get the segment
override applied anymore.

And no, x86_insn_operand_ea() returns x86_seg_none when
ea.type is anything other than OP_MEM.

Jan



* Re: [PATCH v2 08/16] SVM: use generic instruction decoding
  2016-09-29 19:24   ` Andrew Cooper
  2016-09-29 19:30     ` Andrew Cooper
@ 2016-09-30  8:32     ` Jan Beulich
  1 sibling, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  8:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Boris Ostrovsky, Suravee Suthikulpanit

>>> On 29.09.16 at 21:24, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:13, Jan Beulich wrote:
>> ... instead of custom handling. To facilitate this break out init code
>> from _hvm_emulate_one() into the new hvm_emulate_init(), and make
>> hvmemul_insn_fetch( globally available.
> 
> )
> 
>>  int __get_instruction_length_from_list(struct vcpu *v,
>>          const enum instruction_index *list, unsigned int list_count)
>>  {
>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>> -    unsigned int i, j, inst_len = 0;
>> -    enum instruction_index instr = 0;
>> -    u8 buf[MAX_INST_LEN];
>> -    const u8 *opcode = NULL;
>> -    unsigned long fetch_addr, fetch_limit;
>> -    unsigned int fetch_len, max_len;
>> +    struct hvm_emulate_ctxt ctxt;
>> +    struct x86_emulate_state *state;
>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>> +    int modrm_mod;
>>  
> 
> Despite knowing how this works, it is still confusing to read.  Do you
> mind putting in a comment such as:
> 
> /* In a debug build, always use x86_decode_insn() and compare with
> hardware. */

Sure.

>>      for ( j = 0; j < list_count; j++ )
>>      {
>> -        instr = list[j];
>> -        opcode = opc_bytes[instr];
>> +        enum instruction_index instr = list[j];
>>  
>> -        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
>> +        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));
> 
> This is another ASSERT() used as a bounds check, and will suffer a build
> failure on clang.
> 
> You need to use s/enum instruction_index/unsigned int/ to fix the build
> issue.

Oh, right. This predates us having become aware of that clang
issue.

>  Can I also request the use of
> 
> if ( instr >= ARRAY_SIZE(opc_tab) )
> {
>     ASSERT_UNREACHABLE();
>     return 0;
> }
> 
> instead?

Except that I prefer "break" over "return 0" here.
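
I.e. (also with the enum -> unsigned int adjustment) roughly:

    for ( j = 0; j < list_count; j++ )
    {
        unsigned int instr = list[j];

        if ( instr >= ARRAY_SIZE(opc_tab) )
        {
            ASSERT_UNREACHABLE();
            break;
        }
        /* ... matching against opc_tab[instr] as before ... */
    }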

Jan



* Re: [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling
  2016-09-29 21:06   ` Andrew Cooper
@ 2016-09-30  8:55     ` Jan Beulich
  0 siblings, 0 replies; 43+ messages in thread
From: Jan Beulich @ 2016-09-30  8:55 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 29.09.16 at 23:06, <andrew.cooper3@citrix.com> wrote:
> On 28/09/16 09:18, Jan Beulich wrote:
>> There's a new emulator return code being added to allow bypassing
>> certain operations (see the code comment). Its handling in the epilogue
>> code involves moving the raising of the single step trap until after
>> registers were updated. This should probably have been that way from
>> the beginning, to allow the inject_hw_exception() hook to see updated
>> register state (in case it cares) - it's a trap, after all.
> 
> I agree.  (However, given the complexity of this patch, it really would
> be better to split changes like the #DB handling out into a separate patch).

Okay.

>> The other small tweak to the emulator is to single iteration handling
>> of INS and OUTS: Since we don't want to handle any other memory access
>> instructions, we want these to be handled by the rep_ins() / rep_outs()
>> hooks here too. The read() / write() hook pointers get checked for that
>> purpose.
> 
> Moving the non-rep INS/OUTS instructions into rep_ins/outs() (perhaps
> with dropping the rep_ prefix from the callback names) seems sensible.
> 
> However, making this implicit on a check against the read/write hooks
> doesn't seem sensible.  Anyone looking at the code is going to get
> thoroughly confused.
> 
> Can't we make the ins/outs hook deal properly with a rep of 1, and have
> x86_emulate() know not to update %ecx in this case?

The former I'll need to check carefully, but I don't expect the
existing handlers to have an issue with count being 1. However
I think the idea not to use the rep handler in that case was that
the other hook would be cheaper, so I'm not sure we really want
to change this universally. Yet I couldn't think of an elegant way
to have the caller control the desired behavior here.

The latter I think already gets taken care of by put_rep_prefix().

>> And finally handling of exceptions gets changed for REP INS / REP OUTS:
>> If the hook returns X86EMUL_EXCEPTION, register state will still get
>> updated if some iterations have been performed (but the rIP update will
>> get suppressed if not all of them did get handled).
> 
> Isn't this what happens on real hardware anyway?

Yes, but this case so far was of no interest (and hence not
implemented that way), since the HVM emulation code would only
ever return X86EMUL_OKAY when doing a partial access (due to it
clipping the count suitably before doing the first iteration, which
is not a model we can easily use in the PV case).

>>  While on the HVM side
>> the VA -> LA -> PA translation process clips the number of repetitions,
>> doing so would unduly complicate the PV side code being added here.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> One thing to be considered is that despite avoiding the handling of
>> memory reads and writes (other than for INS and OUTS) the set of insns
>> now getting potentially handled by the emulator is much larger than
>> before. A possible solution to this would be a new hook to be called
>> between decode and execution stages, allowing further restrictions to
>> be enforced. Of course this could easily be a follow-up patch, as the
>> one here is quite big already.
> 
> I think this would be a very sensible precaution.  I would suggest even
> that this patch doesn't get committed without being adjacent to such a
> patch.

Okay, I'll work towards that.

>> Another thing to consider is to extend the X86EMUL_EXCEPTION
>> handling change mentioned above to other string instructions. In that
>> case this should probably be broken out into a prereq patch.
> 
> Yes.

Okay.

Jan



Thread overview: 43+ messages
2016-09-28  7:59 [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
2016-09-28  8:06 ` [PATCH v2 01/16] x86emul: split instruction decoding from execution Jan Beulich
2016-09-28 16:24   ` Andrew Cooper
2016-09-28  8:07 ` [PATCH v2 02/16] x86emul: fetch all insn bytes during the decode phase Jan Beulich
2016-09-28 16:37   ` Andrew Cooper
2016-09-28  8:08 ` [PATCH v2 03/16] x86emul: track only rIP in emulator state Jan Beulich
2016-09-28 16:41   ` Andrew Cooper
2016-09-28  8:08 ` [PATCH v2 04/16] x86emul: complete decoding of two-byte instructions Jan Beulich
2016-09-28 17:22   ` Andrew Cooper
2016-09-29  6:37     ` Jan Beulich
2016-09-28  8:09 ` [PATCH v2 05/16] x86emul: add XOP decoding Jan Beulich
2016-09-29  9:07   ` Andrew Cooper
2016-09-28  8:10 ` [PATCH v2 06/16] x86emul: add EVEX decoding Jan Beulich
2016-09-29  9:08   ` Andrew Cooper
2016-09-28  8:12 ` [PATCH v2 07/16] x86emul: generate and make use of a canonical opcode representation Jan Beulich
2016-09-29 10:11   ` Andrew Cooper
2016-09-28  8:13 ` [PATCH v2 08/16] SVM: use generic instruction decoding Jan Beulich
2016-09-29 19:24   ` Andrew Cooper
2016-09-29 19:30     ` Andrew Cooper
2016-09-30  8:32     ` Jan Beulich
2016-09-28  8:13 ` [PATCH v2 09/16] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
2016-09-29 19:47   ` Andrew Cooper
2016-09-30  7:30     ` Jan Beulich
2016-09-28  8:14 ` [PATCH v2 10/16] x86/PV: split out dealing with CRn from privileged instruction handling Jan Beulich
2016-09-29 20:01   ` Andrew Cooper
2016-09-30  7:12     ` Jan Beulich
2016-09-28  8:15 ` [PATCH v2 11/16] x86/PV: split out dealing with DRn " Jan Beulich
2016-09-29 20:13   ` Andrew Cooper
2016-09-30  7:16     ` Jan Beulich
2016-09-28  8:16 ` [PATCH v2 12/16] x86/PV: split out dealing with MSRs " Jan Beulich
2016-09-29 20:44   ` Andrew Cooper
2016-09-28  8:17 ` [PATCH v2 13/16] x86emul: support XSETBV Jan Beulich
2016-09-29 20:45   ` Andrew Cooper
2016-09-30  8:05     ` Jan Beulich
2016-09-28  8:18 ` [PATCH v2 14/16] x86emul: sort opcode 0f01 special case switch() statement Jan Beulich
2016-09-29 20:46   ` Andrew Cooper
2016-09-28  8:18 ` [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
2016-09-29 21:06   ` Andrew Cooper
2016-09-30  8:55     ` Jan Beulich
2016-09-28  8:19 ` [PATCH v2 16/16] x86emul: don't assume a memory operand Jan Beulich
2016-09-29 21:12   ` Andrew Cooper
2016-09-30  8:25     ` Jan Beulich
2016-09-28  8:42 ` [PATCH v2 00/16] x86: split insn emulator decode and execution Jan Beulich
