* [PATCH v4 0/5] x86: extend use of generic insn emulator for PV
@ 2016-12-13 11:08 Jan Beulich
  2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:08 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

1: x86/32on64: use generic instruction decoding for call gate emulation
2: x86emul: generalize exception handling for rep_* hooks
3: x86emul: make write and cmpxchg hooks optional
4: x86/PV: use generic emulator for privileged instruction handling
5: x86/PV: prefer pv_inject_hw_exception()

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Original patch 2 went in. New patch 5. See other patches for
   individual changes.




* [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
@ 2016-12-13 11:26 ` Jan Beulich
  2016-12-13 15:50   ` Andrew Cooper
  2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:26 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

... instead of custom handling. Note that we can't use generic
emulation here, as the emulator's far branch support is still rather
rudimentary at this point.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Drop a redundant conditional and the then pointless __addr_ok()
    check. Deliver exceptions when getting back X86EMUL_EXCEPTION
    (addressing a prior re-basing oversight). Retain zero error code
    for failed insn fetches from original code.
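
For illustration only (not part of the patch): the decode-only pattern
switched to below boils down to roughly the following minimal sketch,
where my_fetch_hook() stands in for a caller-supplied fetch hook such
as gate_op_read(), a 32-bit code segment is assumed, and error
handling is condensed:

static void inspect_insn(struct cpu_user_regs *regs)
{
    struct x86_emulate_ctxt ctxt = { .regs = regs, .addr_size = 32 };
    struct x86_emulate_state *state;

    /* Decode (but don't execute) the insn at the current rIP. */
    state = x86_decode_insn(&ctxt, my_fetch_hook);
    if ( IS_ERR_OR_NULL(state) )
        return; /* fetch faulted, or the insn wasn't decodable */

    /* Query the decoded insn via the x86_insn_*() accessors ... */
    printk(XENLOG_DEBUG "opcode %#x, length %u\n",
           ctxt.opcode, x86_insn_length(state, &ctxt));

    /* ... and always free the decode state again. */
    x86_emulate_free_state(state);
}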

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
 #include <xen/init.h>
 #include <xen/sched.h>
 #include <xen/lib.h>
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/mm.h>
 #include <xen/console.h>
@@ -3279,13 +3280,94 @@ static inline int check_stack_limit(unsi
             (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
 }
 
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    /* We don't mean to emulate any branches. */
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    addr = (uint32_t)addr;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        /*
+         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+         * cpu_has_nx, but we'd then need a "fetch" variant of
+         * __copy_from_user() respecting NX, SMEP, and protection keys.
+         */
+        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void emulate_gate_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, opnd_sel;
-    unsigned int op_default, op_bytes, ad_default, ad_bytes;
-    unsigned long off, eip, opnd_off, base, limit;
-    int jump;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
 
     /* Check whether this fault is due to the use of a call gate. */
     if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3307,7 +3389,8 @@ static void emulate_gate_op(struct cpu_u
      * Decode instruction (and perhaps operand) to determine RPL,
      * whether this is a jump or a call, and the call return offset.
      */
-    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
          !(ar & _SEGMENT_S) ||
          !(ar & _SEGMENT_P) ||
          !(ar & _SEGMENT_CODE) )
@@ -3316,179 +3399,73 @@ static void emulate_gate_op(struct cpu_u
         return;
     }
 
-    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
-    ad_default = ad_bytes = op_default;
-    opnd_sel = opnd_off = 0;
-    jump = -1;
-    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
     {
-        switch ( insn_fetch(u8, base, eip, limit) )
+        if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
         {
-        case 0x66: /* operand-size override */
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            opnd_sel = regs->cs;
-            ASSERT(opnd_sel);
-            continue;
-        case 0x3e: /* DS override */
-            opnd_sel = read_sreg(ds);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x26: /* ES override */
-            opnd_sel = read_sreg(es);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x64: /* FS override */
-            opnd_sel = read_sreg(fs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x65: /* GS override */
-            opnd_sel = read_sreg(gs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x36: /* SS override */
-            opnd_sel = regs->ss;
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0xea:
-            ++jump;
-            /* FALLTHROUGH */
-        case 0x9a:
-            ++jump;
-            opnd_sel = regs->cs;
-            opnd_off = eip;
-            ad_bytes = ad_default;
-            eip += op_bytes + 2;
+            ASSERT(ctxt.ctxt.event_pending);
+            pv_inject_event(&ctxt.ctxt.event);
+        }
+        else
+        {
+            ASSERT(!ctxt.ctxt.event_pending);
+            do_guest_trap(TRAP_gp_fault, regs);
+        }
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
+    {
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
             break;
-        case 0xff:
-            {
-                unsigned int modrm;
+        switch ( modrm_345 & 7 )
+        {
+            enum x86_segment seg;
 
-                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
-                {
-                case 0x28: case 0x68: case 0xa8:
-                    ++jump;
-                    /* FALLTHROUGH */
-                case 0x18: case 0x58: case 0x98:
-                    ++jump;
-                    if ( ad_bytes != 2 )
-                    {
-                        if ( (modrm & 7) == 4 )
-                        {
-                            unsigned int sib;
-                            sib = insn_fetch(u8, base, eip, limit);
-
-                            modrm = (modrm & ~7) | (sib & 7);
-                            if ( ((sib >>= 3) & 7) != 4 )
-                                opnd_off = *(unsigned long *)
-                                    decode_register(sib & 7, regs, 0);
-                            opnd_off <<= sib >> 3;
-                        }
-                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
-                            opnd_off += *(unsigned long *)
-                                decode_register(modrm & 7, regs, 0);
-                        else
-                            modrm |= 0x87;
-                        if ( !opnd_sel )
-                        {
-                            switch ( modrm & 7 )
-                            {
-                            default:
-                                opnd_sel = read_sreg(ds);
-                                break;
-                            case 4: case 5:
-                                opnd_sel = regs->ss;
-                                break;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 1: case 7:
-                            opnd_off = regs->ebx;
-                            break;
-                        case 6:
-                            if ( !(modrm & 0xc0) )
-                                modrm |= 0x80;
-                            else
-                        case 2: case 3:
-                            {
-                                opnd_off = regs->ebp;
-                                if ( !opnd_sel )
-                                    opnd_sel = regs->ss;
-                            }
-                            break;
-                        }
-                        if ( !opnd_sel )
-                            opnd_sel = read_sreg(ds);
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 2: case 4:
-                            opnd_off += regs->esi;
-                            break;
-                        case 1: case 3: case 5:
-                            opnd_off += regs->edi;
-                            break;
-                        }
-                    }
-                    switch ( modrm & 0xc0 )
-                    {
-                    case 0x40:
-                        opnd_off += insn_fetch(s8, base, eip, limit);
-                        break;
-                    case 0x80:
-                        if ( ad_bytes > 2 )
-                            opnd_off += insn_fetch(s32, base, eip, limit);
-                        else
-                            opnd_off += insn_fetch(s16, base, eip, limit);
-                        break;
-                    }
-                    if ( ad_bytes == 4 )
-                        opnd_off = (unsigned int)opnd_off;
-                    else if ( ad_bytes == 2 )
-                        opnd_off = (unsigned short)opnd_off;
-                    break;
-                }
-            }
+        case 5:
+            ++jump;
+            /* fall through */
+        case 3:
+            ++jump;
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
             break;
         }
         break;
     }
 
-    if ( jump < 0 )
-    {
- fail:
-        do_guest_trap(TRAP_gp_fault, regs);
- skip:
-        return;
-    }
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
 
-    if ( (opnd_sel != regs->cs &&
-          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+    if ( rc == X86EMUL_EXCEPTION )
     {
-        do_guest_trap(TRAP_gp_fault, regs);
+        ASSERT(ctxt.ctxt.event_pending);
+        pv_inject_event(&ctxt.ctxt.event);
         return;
     }
 
-    opnd_off += op_bytes;
-#define ad_default ad_bytes
-    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
-    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+    ASSERT(!ctxt.ctxt.event_pending);
+
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
     {
         do_guest_trap(TRAP_gp_fault, regs);
         return;
@@ -3629,7 +3606,7 @@ static void emulate_gate_op(struct cpu_u
             }
         }
         push(regs->cs);
-        push(eip);
+        push(regs->eip + insn_len);
 #undef push
         regs->esp = esp;
         regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5532,6 +5532,14 @@ void x86_emulate_free_state(struct x86_e
 }
 #endif
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+    check_state(state);
+
+    return state->op_bytes << 3;
+}
+
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg)
@@ -5549,6 +5557,33 @@ x86_insn_modrm(const struct x86_emulate_
     return state->modrm_mod;
 }
 
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg)
+{
+    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+    check_state(state);
+
+    return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+    check_state(state);
+
+    switch ( nr )
+    {
+    case 0:
+        return state->imm1;
+    case 1:
+        return state->imm2;
+    }
+
+    return 0;
+}
+
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -628,9 +628,17 @@ x86_decode_insn(
         void *p_data, unsigned int bytes,
         struct x86_emulate_ctxt *ctxt));
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+                   unsigned int nr);
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt);




* [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks
  2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
  2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-12-13 11:27 ` Jan Beulich
  2016-12-13 15:15   ` Paul Durrant
  2016-12-13 11:28 ` [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:27 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant

If any of those hooks returns X86EMUL_EXCEPTION, some register state
should still get updated if some iterations have been performed (but
the rIP update will be suppressed unless all of them were handled).
This updating is done by register_address_increment() and
__put_rep_prefix() (which hence must no longer be bypassed). As a
result, put_rep_prefix() can skip most of the writeback, but needs to
ensure proper completion of the executed instruction.

While on the HVM side the VA -> LA -> PA translation process ensures
that an exception would be raised on the first iteration only,
guaranteeing the same would unduly complicate the PV side code about
to be added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v4: Move changes to xen/arch/x86/hvm/emulate.c here (from a later
    patch). Re-base.
v3: Broken out from a later patch.
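
For illustration only (not part of the patch): a rep_* hook honoring
the new contract reports partial completion through *reps before
returning X86EMUL_EXCEPTION, as in this hedged sketch of a rep_ins
style hook (copy_one_rep() is a made-up per-iteration helper):

static int example_rep_ins(
    uint16_t port, enum x86_segment dst_seg, unsigned long dst_off,
    unsigned int bytes_per_rep, unsigned long *reps,
    struct x86_emulate_ctxt *ctxt)
{
    unsigned long done;

    for ( done = 0; done < *reps; ++done )
    {
        unsigned long off = dst_off + done * bytes_per_rep;

        if ( copy_one_rep(port, dst_seg, off, bytes_per_rep) !=
             X86EMUL_OKAY )
        {
            /*
             * Report how many iterations completed, so rSI/rDI/rCX
             * still get advanced by that amount ...
             */
            *reps = done;
            /*
             * ... and latch the exception; the rIP update will be
             * suppressed since not all iterations were handled.
             */
            x86_emul_pagefault(PFEC_write_access, off, ctxt);
            return X86EMUL_EXCEPTION;
        }
    }

    return X86EMUL_OKAY;
}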

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         x86_emul_pagefault(pfec, addr, &hvmemul_ctxt->ctxt);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 x86_emul_pagefault(pfec, addr & PAGE_MASK, &hvmemul_ctxt->ctxt);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -572,6 +573,7 @@ static int hvmemul_virtual_to_linear(
      * neither know the exact error code to be used, nor can we easily
      * determine the kind of exception (#GP or #TS) in that case.
      */
+    *reps = 0;
     if ( is_x86_user_segment(seg) )
         x86_emul_hw_exception((seg == x86_seg_ss)
                               ? TRAP_stack_error
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -971,7 +971,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -2919,14 +2923,9 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2936,6 +2935,8 @@ x86_emulate(
         }
         register_address_adjust(_regs.edi, nr_reps * dst.bytes);
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2946,14 +2947,9 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2965,6 +2961,8 @@ x86_emulate(
         }
         register_address_adjust(_regs.esi, nr_reps * dst.bytes);
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3187,15 +3185,10 @@ x86_emulate(
         dst.mem.seg = x86_seg_es;
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         src.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
-        if ( (nr_reps > 1) && (ops->rep_movs != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_movs ||
              ((rc = ops->rep_movs(ea.mem.seg, src.mem.off,
                                   dst.mem.seg, dst.mem.off, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, src.mem.off,
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -3206,6 +3199,8 @@ x86_emulate(
         register_address_adjust(_regs.esi, nr_reps * dst.bytes);
         register_address_adjust(_regs.edi, nr_reps * dst.bytes);
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3244,11 +3239,12 @@ x86_emulate(
             dst.val = src.val;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
-        else if ( rc != X86EMUL_OKAY )
-            goto done;
         register_address_adjust(_regs.edi, nr_reps * dst.bytes);
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 




* [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional
  2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
  2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
  2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
@ 2016-12-13 11:28 ` Jan Beulich
  2016-12-13 15:52   ` Andrew Cooper
  2016-12-13 11:28 ` [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
  2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
  4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

While the read and fetch hooks are basically unavoidable, write and
cmpxchg aren't really needed by that many insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Don't open-code fail_if(). Add ASSERT()s for fetch and read hooks.
    Move a write hook check closer to its use, and add ASSERT()s and
    comments where this is not desirable. Re-base.
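
For illustration only (not part of the patch): with the hooks now
optional, a caller wanting read-only emulation can register a minimal
ops table along these lines (read_mem() and fetch_insn() are made-up
names for caller-supplied hooks):

static const struct x86_emulate_ops ro_emulate_ops = {
    /* Still required: ASSERT()ed in x86_emulate() resp. x86_decode(). */
    .read       = read_mem,
    .insn_fetch = fetch_insn,
    /*
     * .write and .cmpxchg deliberately left NULL: insns needing them
     * now take the fail_if() paths added below, instead of the table
     * having to hold PTR_POISON placeholders.
     */
};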

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1520,6 +1520,7 @@ protmode_load_seg(
     {
         uint32_t new_desc_b = desc.b | a_flag;
 
+        fail_if(!ops->cmpxchg);
         switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
                                     &new_desc_b, sizeof(desc.b), ctxt)) )
         {
@@ -2017,6 +2018,8 @@ x86_decode(
     bool pc_rel = false;
     int rc = X86EMUL_OKAY;
 
+    ASSERT(ops->insn_fetch);
+
     memset(state, 0, sizeof(*state));
     ea.type = OP_NONE;
     ea.mem.seg = x86_seg_ds;
@@ -2494,6 +2497,8 @@ x86_emulate(
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
 
+    ASSERT(ops->read);
+
     rc = x86_decode(&state, ctxt, ops);
     if ( rc != X86EMUL_OKAY )
         return rc;
@@ -2658,13 +2663,18 @@ x86_emulate(
         }
         else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
         {
+            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
             if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
                                   &dst.val, dst.bytes, ctxt, ops)) )
                 goto done;
             dst.orig_val = dst.val;
         }
-        else /* Lock prefix is allowed only on RMW instructions. */
+        else
+        {
+            /* Lock prefix is allowed only on RMW instructions. */
             generate_exception_if(lock_prefix, EXC_UD);
+            fail_if(!ops->write);
+        }
         break;
     }
 
@@ -2813,7 +2823,9 @@ x86_emulate(
         unsigned long regs[] = {
             _regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
             _regs.esp, _regs.ebp, _regs.esi, _regs.edi };
+
         generate_exception_if(mode_64bit(), EXC_UD);
+        fail_if(!ops->write);
         for ( i = 0; i < 8; i++ )
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
                                   &regs[i], op_bytes, ctxt)) != 0 )
@@ -3111,7 +3123,7 @@ x86_emulate(
     case 0x9a: /* call (far, absolute) */
         ASSERT(!mode_64bit());
     far_call:
-        fail_if(ops->read_segment == NULL);
+        fail_if(!ops->read_segment || !ops->write);
 
         if ( (rc = ops->read_segment(x86_seg_cs, &sreg, ctxt)) ||
              (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
@@ -3350,6 +3362,7 @@ x86_emulate(
         dst.type = OP_REG;
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.ebp;
+        fail_if(!ops->write);
         if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
                               &_regs.ebp, dst.bytes, ctxt)) )
             goto done;
@@ -4446,6 +4459,7 @@ x86_emulate(
                 else if ( rc != X86EMUL_UNHANDLEABLE )
                     goto done;
             }
+            fail_if(limit && !ops->write);
             while ( limit )
             {
                 rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
@@ -4466,7 +4480,7 @@ x86_emulate(
         case 1: /* sidt */
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
             generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
-            fail_if(ops->read_segment == NULL);
+            fail_if(!ops->read_segment || !ops->write);
             if ( (rc = ops->read_segment(seg, &sreg, ctxt)) )
                 goto done;
             if ( mode_64bit() )
@@ -4503,12 +4517,18 @@ x86_emulate(
             break;
         case 4: /* smsw */
             generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
-            ea.bytes = (ea.type == OP_MEM) ? 2 : op_bytes;
+            if ( ea.type == OP_MEM )
+            {
+                fail_if(!ops->write);
+                d |= Mov; /* force writeback */
+                ea.bytes = 2;
+            }
+            else
+                ea.bytes = op_bytes;
             dst = ea;
             fail_if(ops->read_cr == NULL);
             if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
                 goto done;
-            d |= Mov; /* force writeback */
             break;
         case 6: /* lmsw */
             fail_if(ops->read_cr == NULL);
@@ -4710,6 +4730,8 @@ x86_emulate(
             if ( !(b & 1) )
                 rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
                                ea.bytes, ctxt);
+            else
+                fail_if(!ops->write); /* Check before running the stub. */
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
@@ -4724,8 +4746,11 @@ x86_emulate(
         put_fpu(&fic);
         put_stub(stub);
         if ( !rc && (b & 1) && (ea.type == OP_MEM) )
+        {
+            ASSERT(ops->write); /* See the fail_if() above. */
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
+        }
         if ( rc )
             goto done;
         dst.type = OP_NONE;
@@ -4994,6 +5019,8 @@ x86_emulate(
             if ( b == 0x6f )
                 rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
                                ea.bytes, ctxt);
+            else
+                fail_if(!ops->write); /* Check before running the stub. */
         }
         if ( ea.type == OP_MEM || b == 0x7e )
         {
@@ -5015,8 +5042,11 @@ x86_emulate(
         put_fpu(&fic);
         put_stub(stub);
         if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
+        {
+            ASSERT(ops->write); /* See the fail_if() above. */
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
+        }
         if ( rc )
             goto done;
         dst.type = OP_NONE;
@@ -5264,6 +5294,7 @@ x86_emulate(
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        fail_if(!ops->cmpxchg);
         if ( op_bytes == 8 )
             host_and_vcpu_must_have(cx16);
         op_bytes *= 2;
@@ -5396,12 +5427,18 @@ x86_emulate(
              !ctxt->force_writeback )
             /* nothing to do */;
         else if ( lock_prefix )
+        {
+            fail_if(!ops->cmpxchg);
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
                 &dst.val, dst.bytes, ctxt);
+        }
         else
+        {
+            fail_if(!ops->write);
             rc = ops->write(
                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+        }
         if ( rc != 0 )
             goto done;
     default:
@@ -5485,8 +5522,6 @@ x86_decode_insn(
     const struct x86_emulate_ops ops = {
         .insn_fetch = insn_fetch,
         .read       = x86emul_unhandleable_rw,
-        .write      = PTR_POISON,
-        .cmpxchg    = PTR_POISON,
     };
     int rc = x86_decode(state, ctxt, &ops);
 



https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
                   ` (2 preceding siblings ...)
  2016-12-13 11:28 ` [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
@ 2016-12-13 11:28 ` Jan Beulich
  2016-12-13 16:53   ` Andrew Cooper
  2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
  4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 47653 bytes --]

A new emulator return code (X86EMUL_DONE) is being added to allow
bypassing certain operations (see the code comment).
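
As a hedged illustration (hypothetical hook name and predicate, not
from this patch): a caller's validate() hook can consume an insn
entirely by returning the new code, making x86_emulate() skip operand
writeback and perform only the common retire logic:

    /* Sketch: a validate() hook claiming full handling of an insn. */
    static int my_validate(const struct x86_emulate_state *state,
                           struct x86_emulate_ctxt *ctxt)
    {
        if ( fully_handled_elsewhere(ctxt) ) /* hypothetical predicate */
            return X86EMUL_DONE;  /* bypass emulation and writeback */
        return X86EMUL_OKAY;      /* let normal emulation proceed */
    }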

Another small tweak to the emulator concerns the single-iteration
handling of INS and OUTS: since we don't want to handle any other
memory-accessing instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too.
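
Condensed from the x86_emulate() hunk further down (illustrative only,
locals abbreviated): a single-iteration INS now falls back to the
rep_ins() hook whenever the plain read_io()/write path isn't available:

    /* Sketch: hook preference for INS after this change. */
    rc = X86EMUL_UNHANDLEABLE;
    if ( nr_reps == 1 && ops->read_io && ops->write )
        rc = ops->read_io(port, bytes, &val, ctxt);
    if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
        rc = ops->rep_ins(port, seg, off, bytes, &nr_reps, ctxt);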

And then long-mode related bits now get hidden from 32-bit guests.
This should have been the case from the beginning, but becomes a
requirement now, as the emulator's in_longmode() needs to reflect the
guest's view.
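
For concreteness (a sketch with a hypothetical helper name; the actual
change lives in priv_op_read_msr() below): a 32-bit guest must not see
EFER.LMA set, since in_longmode() derives the guest's mode from the
EFER value returned by the read_msr() hook:

    /* Sketch: the EFER view handed back to a 32-bit PV guest. */
    static uint64_t pv_guest_efer(const struct domain *currd)
    {
        uint64_t val = read_efer();

        if ( is_pv_32bit_domain(currd) )
            val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
        return val;
    }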

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Rename priv_op_to_linear() to pv_emul_virt_to_linear() and change
    its return type. Refuse system segments and set just one of L and
    DB in 64-bit mode case of priv_op_read_segment(). Have
    pv_emul_cpuid() return X86EMUL_EXCEPTION in the CPUID faulting
    case. Add EFER behavioral change to description.
v3: Re-base. Do away with the special case pointer checks on the ->read
    and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
    for 32-bit guests (to avoid the emulator's in_longmode() returning
    a wrong result). Introduce and use ->validate() hook. Make
    formatting more consistent.

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
     pv_inject_event(&event);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2023,6 +2020,155 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+                                  unsigned int bytes, unsigned long limit,
+                                  enum x86_segment seg,
+                                  struct x86_emulate_ctxt *ctxt,
+                                  unsigned long *addr)
+{
+    int rc = X86EMUL_OKAY;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+            rc = X86EMUL_EXCEPTION;
+        *addr = (uint32_t)*addr;
+    }
+    else if ( !__addr_ok(*addr) )
+        rc = X86EMUL_EXCEPTION;
+
+    if ( unlikely(rc == X86EMUL_EXCEPTION) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+
+    return rc;
+}
+
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                                x86_seg_cs, ctxt, &addr);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+                           addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access the I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            if ( !is_x86_user_segment(seg) )
+                return X86EMUL_UNHANDLEABLE;
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+        {
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+            reg->attr.fields.l = 1;
+        }
+        else
+            reg->attr.fields.db = 1;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2269,6 +2415,238 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, x86_seg_es, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, seg, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2409,6 +2787,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2436,6 +2815,28 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2539,7 +2940,6 @@ static int priv_op_read_msr(unsigned int
         if ( rc )
             return X86EMUL_OKAY;
         /* fall through */
-    case MSR_EFER:
     normal:
         /* Everyone can read the MSR space. */
         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3161,41 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
 int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
                   unsigned int *edx, struct x86_emulate_ctxt *ctxt)
 {
     struct cpu_user_regs regs = *ctxt->regs;
 
+    /*
+     * x86_emulate uses this function to query CPU features for its own
+     * internal use. Make sure we're actually emulating CPUID before checking
+     * for emulated CPUID faulting.
+     */
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+    {
+        const struct vcpu *curr = current;
+
+        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+        if ( curr->arch.cpuid_faulting &&
+             !guest_kernel_mode(curr, ctxt->regs) )
+            return X86EMUL_EXCEPTION;
+    }
+
     regs._eax = *eax;
     regs._ecx = *ecx;
 
@@ -2779,497 +3209,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
     return X86EMUL_OKAY;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
-
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+    switch ( ctxt->opcode )
     {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
-
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
-
-        port = (u16)regs->edx;
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(PFEC_write_access,
-                                     data_base + rd_ad(edi) + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
             break;
-
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(0, data_base + rd_ad(esi)
-                                     + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
-
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
-
-        goto done;
-    }
-
-    /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
-     */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
-    {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
-    }
+        return X86EMUL_DONE;
 
-    /* No decode of this single-byte opcode. */
-    goto fail;
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
 
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
             break;
-        case 0xd1: /* XSETBV */
+        switch ( modrm_reg & 7 )
         {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+        case 2: /* xsetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
 
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
-            {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
+    return X86EMUL_UNHANDLEABLE;
+}
 
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
 
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->_eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change TF, IF, or IOPL.
+     */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+    regs->_eflags |= X86_EFLAGS_IF;
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+    /* More strict than x86_emulate_wrapper(). */
+    ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+            {
+                uint64_t val = rdtsc();
 
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
-        if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
-            goto fail;
-
-        pv_cpuid(regs);
-        break;
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
 
-    default:
-        goto fail;
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3615,7 +3702,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1185,7 +1185,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
 
     /* Ensure the TSS has an io-bitmap-offset field. */
     generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
@@ -2506,6 +2506,21 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
+    if ( ops->validate )
+    {
+#ifndef NDEBUG
+        state.caller = __builtin_return_address(0);
+#endif
+        rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+        state.caller = NULL;
+#endif
+        if ( rc == X86EMUL_DONE )
+            goto no_writeback;
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
+
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
@@ -2935,13 +2950,28 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_ins ||
-             ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_ins )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->read_io && ops->write )
         {
-            fail_if(ops->read_io == NULL);
+            rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+            rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+                              &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            fail_if(!ops->read_io || !ops->write);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
                 goto done;
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             dst.type = OP_MEM;
             nr_reps = 1;
         }
@@ -2959,14 +2989,30 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_outs ||
-             ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_outs )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->write_io )
         {
-            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
-                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
+            rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+                            ctxt, ops);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+            rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+                               &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
                 goto done;
             fail_if(ops->write_io == NULL);
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
                 goto done;
             nr_reps = 1;
@@ -4039,7 +4085,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5445,9 +5495,7 @@ x86_emulate(
         break;
     }
 
- no_writeback:
-    /* Commit shadow register state. */
-    _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -5457,7 +5505,15 @@ x86_emulate(
     if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
         ctxt->retire.singlestep = true;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip = _regs.eip;
+        rc = X86EMUL_OKAY;
+    }
+
+    ctxt->regs->eflags &= ~EFLG_RF;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -146,6 +146,14 @@ struct __attribute__((__packed__)) segme
 #define X86EMUL_EXCEPTION      2
  /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+ /*
+  * Operation fully done by one of the hooks:
+  * - validate(): operation completed (except common insn retire logic)
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -156,6 +164,8 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_ymm  /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
 };
 
+struct x86_emulate_state;
+
 /*
  * These operations represent the instruction emulator's interface to memory,
  * I/O ports, privileged state... pretty much everything other than GPRs.
@@ -238,6 +248,14 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * validate: Post-decode, pre-emulate hook to allow caller controlled
+     * filtering.
+     */
+    int (*validate)(
+        const struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
      *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
      *  @reps:  [IN ] Maximum repetitions to be emulated.



[-- Attachment #2: x86-PV-priv-op-generic-emul.patch --]
[-- Type: text/plain, Size: 47717 bytes --]

x86/PV: use generic emulator for privileged instruction handling

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment).

Another small tweak to the emulator is to single iteration handling
of INS and OUTS: Since we don't want to handle any other memory access
instructions, we want these to be handled by the rep_ins() / rep_outs()
hooks here too.

And then long-mode related bits now get hidden from the guest. This
should have been that way from the beginning, but becomes a requirement
now as the emulator's in_longmode() needs this to reflect guest view.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Rename priv_op_to_linear() to pv_emul_virt_to_linear() and change
    its return type. Refuse system segments and set just one of L and
    DB in 64-bit mode case of priv_op_read_segment(). Have
    pv_emul_cpuid() return X86EMUL_EXCEPTION in the CPUID faulting
    case. Add EFER behavioral change to description.
v3: Re-base. Do away with the special case pointer checks on the ->read
    and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
    for 32-bit guests (to avoid the emulator's in_longmode() returning
    a wrong result). Introduce and use ->validate() hook. Make
    formatting more consistent.

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
     pv_inject_event(&event);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2023,6 +2020,155 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+                                  unsigned int bytes, unsigned long limit,
+                                  enum x86_segment seg,
+                                  struct x86_emulate_ctxt *ctxt,
+                                  unsigned long *addr)
+{
+    int rc = X86EMUL_OKAY;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+            rc = X86EMUL_EXCEPTION;
+        *addr = (uint32_t)*addr;
+    }
+    else if ( !__addr_ok(*addr) )
+        rc = X86EMUL_EXCEPTION;
+
+    if ( unlikely(rc == X86EMUL_EXCEPTION) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+
+    return rc;
+}
+
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                                x86_seg_cs, ctxt, &addr);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+                           addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access to I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            if ( !is_x86_user_segment(seg) )
+                return X86EMUL_UNHANDLEABLE;
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+        {
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+            reg->attr.fields.l = 1;
+        }
+        else
+            reg->attr.fields.db = 1;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this, for consistency, for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
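
As a side note, the wrap-safe limit check in pv_emul_virt_to_linear()
above can be read in isolation roughly as follows (a sketch, not part
of the patch):

#include <stdbool.h>

/* Sketch: mirrors the check in pv_emul_virt_to_linear(). */
static bool fits_in_limit(unsigned long offset, unsigned int bytes,
                          unsigned long limit)
{
    /*
     * "limit < bytes - 1" must come first: e.g. with limit = 2 and
     * bytes = 4, "limit - bytes + 1" would wrap to ~0UL, and the second
     * comparison could then never fail an access.
     */
    return !(limit < bytes - 1 || offset > limit - bytes + 1);
}
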
@@ -2269,6 +2415,238 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, x86_seg_es, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, seg, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
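
For reference, the 16 stub bytes assembled above decode as follows
(illustrative layout; indices as used in io_emul_stub_setup()):

/*
 *  [0..1]   48 b9          movabs $imm64, %rcx
 *  [2..9]   <imm64>        address of host_to_guest_gpr_switch
 *  [10..11] ff d1          call *%rcx
 *  [12]     66 or 90       data16 prefix for 2-byte accesses, else nop
 *  [13]     <opcode>       the trapped IN/OUT opcode
 *  [14]     <port> or 90   imm8 port for the e4/e5/e6/e7 forms, else nop
 *  [15]     c3             ret (returns via guest_to_host_gpr_switch)
 */
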
@@ -2409,6 +2787,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2436,6 +2815,28 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
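
Schematically, the deferral flags set above combine as follows once
emulation completes (see the X86EMUL_OKAY handling added to
emulate_privileged_op() further down; a sketch):

/*
 *  RDTSC  -> tsc == TSC_BASE
 *            vtsc domains:  pv_soft_rdtsc(curr, regs, 0)
 *            otherwise:     raw rdtsc() split into %edx:%eax
 *  RDTSCP -> tsc == TSC_BASE | TSC_AUX
 *            pv_soft_rdtsc(curr, regs, 1)
 */
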
@@ -2539,7 +2940,6 @@ static int priv_op_read_msr(unsigned int
         if ( rc )
             return X86EMUL_OKAY;
         /* fall through */
-    case MSR_EFER:
     normal:
         /* Everyone can read the MSR space. */
         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3161,41 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
 int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
                   unsigned int *edx, struct x86_emulate_ctxt *ctxt)
 {
     struct cpu_user_regs regs = *ctxt->regs;
 
+    /*
+     * x86_emulate uses this function to query CPU features for its own
+     * internal use. Make sure we're actually emulating CPUID before checking
+     * for emulated CPUID faulting.
+     */
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+    {
+        const struct vcpu *curr = current;
+
+        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+        if ( curr->arch.cpuid_faulting &&
+             !guest_kernel_mode(curr, ctxt->regs) )
+            return X86EMUL_EXCEPTION;
+    }
+
     regs._eax = *eax;
     regs._ecx = *ecx;
 
@@ -2779,497 +3209,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
     return X86EMUL_OKAY;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
-
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+    switch ( ctxt->opcode )
     {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
-
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
-
-        port = (u16)regs->edx;
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(PFEC_write_access,
-                                     data_base + rd_ad(edi) + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
             break;
-
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(0, data_base + rd_ad(esi)
-                                     + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
-        }
-
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
-
-        goto done;
-    }
-
-    /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
-     */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
-    {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
-    }
+        return X86EMUL_DONE;
 
-    /* No decode of this single-byte opcode. */
-    goto fail;
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
 
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
             break;
-        case 0xd1: /* XSETBV */
+        switch ( modrm_reg & 7 )
         {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+        case 2: /* xsetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
 
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
-            {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
+    return X86EMUL_UNHANDLEABLE;
+}
 
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
 
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->_eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change TF, IF, or IOPL.
+     */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+    regs->_eflags |= X86_EFLAGS_IF;
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+    /* More strict than x86_emulate_wrapper(). */
+    ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+            {
+                uint64_t val = rdtsc();
 
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
-        if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
-            goto fail;
-
-        pv_cpuid(regs);
-        break;
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
 
-    default:
-        goto fail;
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
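
A condensed view of the rewritten emulate_privileged_op() (sketch):

/*
 *  read CS descriptor           -> return 0 (caller raises #GP) if bogus
 *  mirror vIF/vIOPL into EFLAGS -> emulator mode checks see guest state
 *  x86_emulate(&ctxt.ctxt, &priv_op_ops)
 *  un-mirror EFLAGS
 *  OKAY      -> complete deferred RDTSC(P), raise #DB on bpmatch/singlestep
 *  RETRY     -> EXCRET_fault_fixed (insn re-executed after preemption)
 *  EXCEPTION -> pv_inject_event(&ctxt.ctxt.event)
 *  other     -> return 0
 */
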
 
@@ -3615,7 +3702,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1185,7 +1185,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
 
     /* Ensure the TSS has an io-bitmap-offset field. */
     generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
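
The X86EMUL_DONE special case ties in with priv_op_read_segment() on
the traps.c side; schematically (a sketch, not part of the patch):

/*
 * For IN/OUT/INS/OUTS, priv_op_read_segment(x86_seg_tr, ...) returns
 * X86EMUL_DONE, meaning: skip the TSS I/O bitmap walk here -- the
 * read_io()/write_io()/rep_*() hooks do their own permission checks
 * via guest_io_okay().
 */
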
@@ -2506,6 +2506,21 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
+    if ( ops->validate )
+    {
+#ifndef NDEBUG
+        state.caller = __builtin_return_address(0);
+#endif
+        rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+        state.caller = NULL;
+#endif
+        if ( rc == X86EMUL_DONE )
+            goto no_writeback;
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
+
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
@@ -2935,13 +2950,28 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_ins ||
-             ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_ins )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->read_io && ops->write )
         {
-            fail_if(ops->read_io == NULL);
+            rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+            rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+                              &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            fail_if(!ops->read_io || !ops->write);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
                 goto done;
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             dst.type = OP_MEM;
             nr_reps = 1;
         }
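
The resulting decision ladder for INS (sketch):

/*
 *  1. one iteration, read_io + write hooks present
 *       -> single port read, committed via the generic writeback path
 *  2. otherwise (or 1. was UNHANDLEABLE), rep_ins present
 *       -> batched transfer through the hook
 *  3. still UNHANDLEABLE with at least one iteration left
 *       -> single-iteration fallback (read_io + write now mandatory)
 * OUTS below follows the same pattern with read_ulong()/write_io().
 */
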
@@ -2959,14 +2989,30 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_outs ||
-             ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_outs )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->write_io )
         {
-            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
-                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
+            rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+                            ctxt, ops);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+            rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+                               &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
                 goto done;
             fail_if(ops->write_io == NULL);
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
                 goto done;
             nr_reps = 1;
@@ -4039,7 +4085,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5445,9 +5495,7 @@ x86_emulate(
         break;
     }
 
- no_writeback:
-    /* Commit shadow register state. */
-    _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -5457,7 +5505,15 @@ x86_emulate(
     if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
         ctxt->retire.singlestep = true;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip = _regs.eip;
+        rc = X86EMUL_OKAY;
+    }
+
+    ctxt->regs->eflags &= ~EFLG_RF;
 
  done:
     _put_fpu();
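
The commit rules this establishes (sketch):

/*
 *  rc != X86EMUL_DONE -> full shadow register state written back
 *  rc == X86EMUL_DONE -> only rIP committed, rc folded to X86EMUL_OKAY
 *  EFLG_RF is cleared on ctxt->regs in both cases (moved past the
 *  writeback so the DONE path is covered, too)
 */
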
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -146,6 +146,14 @@ struct __attribute__((__packed__)) segme
 #define X86EMUL_EXCEPTION      2
  /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+ /*
+  * Operation fully done by one of the hooks:
+  * - validate(): operation completed (except common insn retire logic)
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -156,6 +164,8 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_ymm  /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
 };
 
+struct x86_emulate_state;
+
 /*
  * These operations represent the instruction emulator's interface to memory,
  * I/O ports, privileged state... pretty much everything other than GPRs.
@@ -238,6 +248,14 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * validate: Post-decode, pre-emulate hook to allow caller-controlled
+     * filtering.
+     */
+    int (*validate)(
+        const struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
      *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
      *  @reps:  [IN ] Maximum repetitions to be emulated.
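
As a usage illustration (hypothetical hook, not part of this series),
a caller-side filter built on the new hook could look like:

static int my_validate(const struct x86_emulate_state *state,
                       struct x86_emulate_ctxt *ctxt)
{
    /* Accept only CPUID; reject everything else before emulation. */
    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
        return X86EMUL_OKAY;

    return X86EMUL_UNHANDLEABLE;
}
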


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception()
  2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
                   ` (3 preceding siblings ...)
  2016-12-13 11:28 ` [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-12-13 11:29 ` Jan Beulich
  2016-12-13 15:54   ` Andrew Cooper
  4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:29 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 2164 bytes --]

... over editing the error code and calling do_guest_trap().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -3565,20 +3565,17 @@ static void emulate_gate_op(struct cpu_u
           ((ar >> 13) & 3) > (regs->cs & 3) :
           ((ar >> 13) & 3) != (regs->cs & 3)) )
     {
-        regs->error_code = sel;
-        do_guest_trap(TRAP_gp_fault, regs);
+        pv_inject_hw_exception(TRAP_gp_fault, sel);
         return;
     }
     if ( !(ar & _SEGMENT_P) )
     {
-        regs->error_code = sel;
-        do_guest_trap(TRAP_no_segment, regs);
+        pv_inject_hw_exception(TRAP_no_segment, sel);
         return;
     }
     if ( off > limit )
     {
-        regs->error_code = 0;
-        do_guest_trap(TRAP_gp_fault, regs);
+        pv_inject_hw_exception(TRAP_gp_fault, 0);
         return;
     }
 
@@ -3617,15 +3614,13 @@ static void emulate_gate_op(struct cpu_u
                  (ar & _SEGMENT_CODE) ||
                  !(ar & _SEGMENT_WR) )
             {
-                regs->error_code = ss & ~3;
-                do_guest_trap(TRAP_invalid_tss, regs);
+                pv_inject_hw_exception(TRAP_invalid_tss, ss & ~3);
                 return;
             }
             if ( !(ar & _SEGMENT_P) ||
                  !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
             {
-                regs->error_code = ss & ~3;
-                do_guest_trap(TRAP_stack_error, regs);
+                pv_inject_hw_exception(TRAP_stack_error, ss & ~3);
                 return;
             }
             stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
@@ -3681,8 +3676,7 @@ static void emulate_gate_op(struct cpu_u
             }
             if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
             {
-                regs->error_code = 0;
-                do_guest_trap(TRAP_stack_error, regs);
+                pv_inject_hw_exception(TRAP_stack_error, 0);
                 return;
             }
             stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);





^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks
  2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
@ 2016-12-13 15:15   ` Paul Durrant
  0 siblings, 0 replies; 14+ messages in thread
From: Paul Durrant @ 2016-12-13 15:15 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 13 December 2016 11:28
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>
> Subject: [PATCH v4 2/5] x86emul: generalize exception handling for rep_*
> hooks
> 
> If any of those hooks returns X86EMUL_EXCEPTION, some register state
> should still get updated if some iterations have been performed (but
> the rIP update will get suppressed if not all of them did get handled).
> This updating is done by register_address_increment() and
> __put_rep_prefix() (which hence must no longer be bypassed). As a
> result put_rep_prefix() can then skip most of the writeback, but needs
> to ensure proper completion of the executed instruction.
> 
> While on the HVM side the VA -> LA -> PA translation process ensures
> that an exception would be raised on the first iteration only, doing so
> would unduly complicate the PV side code about to be added.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> v4: Move changes to xen/arch/x86/hvm/emulate.c here (from a later
>     patch). Re-base.
> v3: Broken out from a later patch.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
>              return X86EMUL_RETRY;
> +        *reps = 0;
>          x86_emul_pagefault(pfec, addr, &hvmemul_ctxt->ctxt);
>          return X86EMUL_EXCEPTION;
>      }
> @@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
>              if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
>                  return X86EMUL_RETRY;
>              done /= bytes_per_rep;
> +            *reps = done;
>              if ( done == 0 )
>              {
>                  ASSERT(!reverse);
> @@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
>                  x86_emul_pagefault(pfec, addr & PAGE_MASK, &hvmemul_ctxt-
> >ctxt);
>                  return X86EMUL_EXCEPTION;
>              }
> -            *reps = done;
>              break;
>          }
> 
> @@ -572,6 +573,7 @@ static int hvmemul_virtual_to_linear(
>       * neither know the exact error code to be used, nor can we easily
>       * determine the kind of exception (#GP or #TS) in that case.
>       */
> +    *reps = 0;
>      if ( is_x86_user_segment(seg) )
>          x86_emul_hw_exception((seg == x86_seg_ss)
>                                ? TRAP_stack_error
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -971,7 +971,11 @@ static void __put_rep_prefix(
> 
>  #define put_rep_prefix(reps_completed) ({                               \
>      if ( rep_prefix() )                                                 \
> +    {                                                                   \
>          __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
> +        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
> +            goto no_writeback;                                          \
> +    }                                                                   \
>  })
> 
>  /* Clip maximum repetitions so that the index register at most just wraps. */
> @@ -2919,14 +2923,9 @@ x86_emulate(
>          dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
>          if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
>              goto done;
> -        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
> +        if ( (nr_reps == 1) || !ops->rep_ins ||
>               ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
> -                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
> -        {
> -            if ( rc != 0 )
> -                goto done;
> -        }
> -        else
> +                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
>          {
>              fail_if(ops->read_io == NULL);
>              if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
> @@ -2936,6 +2935,8 @@ x86_emulate(
>          }
>          register_address_adjust(_regs.edi, nr_reps * dst.bytes);
>          put_rep_prefix(nr_reps);
> +        if ( rc != X86EMUL_OKAY )
> +            goto done;
>          break;
>      }
> 
> @@ -2946,14 +2947,9 @@ x86_emulate(
>          ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
>          if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
>              goto done;
> -        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
> +        if ( (nr_reps == 1) || !ops->rep_outs ||
>               ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
> -                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
> -        {
> -            if ( rc != 0 )
> -                goto done;
> -        }
> -        else
> +                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
>          {
>              if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
>                                    &dst.val, dst.bytes, ctxt, ops)) != 0 )
> @@ -2965,6 +2961,8 @@ x86_emulate(
>          }
>          register_address_adjust(_regs.esi, nr_reps * dst.bytes);
>          put_rep_prefix(nr_reps);
> +        if ( rc != X86EMUL_OKAY )
> +            goto done;
>          break;
>      }
> 
> @@ -3187,15 +3185,10 @@ x86_emulate(
>          dst.mem.seg = x86_seg_es;
>          dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
>          src.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
> -        if ( (nr_reps > 1) && (ops->rep_movs != NULL) &&
> +        if ( (nr_reps == 1) || !ops->rep_movs ||
>               ((rc = ops->rep_movs(ea.mem.seg, src.mem.off,
>                                    dst.mem.seg, dst.mem.off, dst.bytes,
> -                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
> -        {
> -            if ( rc != 0 )
> -                goto done;
> -        }
> -        else
> +                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
>          {
>              if ( (rc = read_ulong(ea.mem.seg, src.mem.off,
>                                    &dst.val, dst.bytes, ctxt, ops)) != 0 )
> @@ -3206,6 +3199,8 @@ x86_emulate(
>          register_address_adjust(_regs.esi, nr_reps * dst.bytes);
>          register_address_adjust(_regs.edi, nr_reps * dst.bytes);
>          put_rep_prefix(nr_reps);
> +        if ( rc != X86EMUL_OKAY )
> +            goto done;
>          break;
>      }
> 
> @@ -3244,11 +3239,12 @@ x86_emulate(
>              dst.val = src.val;
>              dst.type = OP_MEM;
>              nr_reps = 1;
> +            rc = X86EMUL_OKAY;
>          }
> -        else if ( rc != X86EMUL_OKAY )
> -            goto done;
>          register_address_adjust(_regs.edi, nr_reps * dst.bytes);
>          put_rep_prefix(nr_reps);
> +        if ( rc != X86EMUL_OKAY )
> +            goto done;
>          break;
>      }
> 
> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-12-13 15:50   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2016-12-13 15:50 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 13/12/16 11:26, Jan Beulich wrote:
> ... instead of custom handling. Note that we can't use generic
> emulation, as the emulator's far branch support is rather rudimentary
> at this point in time.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional
  2016-12-13 11:28 ` [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
@ 2016-12-13 15:52   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2016-12-13 15:52 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 13/12/16 11:28, Jan Beulich wrote:
> While the read and fetch hooks are basically unavoidable, write and
> cmpxchg aren't really needed by that many insns.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception()
  2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
@ 2016-12-13 15:54   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2016-12-13 15:54 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 13/12/16 11:29, Jan Beulich wrote:
> ... over editing the error code and calling do_guest_trap().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Thanks for doing this.  It was somewhere on my TODO list.  I'd like to
drop do_guest_trap() entirely, eventually.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-13 11:28 ` [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-12-13 16:53   ` Andrew Cooper
  2016-12-13 17:16     ` Jan Beulich
  2016-12-14  7:35     ` Jan Beulich
  0 siblings, 2 replies; 14+ messages in thread
From: Andrew Cooper @ 2016-12-13 16:53 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 13/12/16 11:28, Jan Beulich wrote:
> +static int priv_op_insn_fetch(enum x86_segment seg,
> +                              unsigned long offset,
> +                              void *p_data,
> +                              unsigned int bytes,
> +                              struct x86_emulate_ctxt *ctxt)
> +{
> +    const struct priv_op_ctxt *poc =
> +        container_of(ctxt, struct priv_op_ctxt, ctxt);
> +    unsigned int rc;
> +    unsigned long addr = poc->cs.base + offset;
> +
> +    ASSERT(seg == x86_seg_cs);
> +
> +    /* We don't mean to emulate any branches. */
> +    if ( !bytes )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
> +                                x86_seg_cs, ctxt, &addr);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
> +    {
> +        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
> +                           addr + bytes - rc, ctxt);

Please can we retain the 0 and comment here.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1185,7 +1185,7 @@ static int ioport_access_check(
>  
>      fail_if(ops->read_segment == NULL);
>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
> -        return rc;
> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;

Please have at least a comment here

/* Used by the PV path to defer the port permission check to the ioport
hooks. */

>  
>      /* Ensure the TSS has an io-bitmap-offset field. */
>      generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
>

Other than that, subject to double checking the IOPL behaviour,
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
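
With a comment along those lines folded in, the hunk would read
roughly as follows (wording illustrative -- Jan adjusts it in his
reply):

    fail_if(ops->read_segment == NULL);
    /*
     * X86EMUL_DONE coming back from the hook signals that the
     * permission check has already been done by other means, so the
     * TSS io-bitmap lookup below can be skipped.
     */
    if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;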

* Re: [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-13 16:53   ` Andrew Cooper
@ 2016-12-13 17:16     ` Jan Beulich
  2016-12-14  7:35     ` Jan Beulich
  1 sibling, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 17:16 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 13.12.16 at 17:53, <andrew.cooper3@citrix.com> wrote:
> On 13/12/16 11:28, Jan Beulich wrote:
>> +static int priv_op_insn_fetch(enum x86_segment seg,
>> +                              unsigned long offset,
>> +                              void *p_data,
>> +                              unsigned int bytes,
>> +                              struct x86_emulate_ctxt *ctxt)
>> +{
>> +    const struct priv_op_ctxt *poc =
>> +        container_of(ctxt, struct priv_op_ctxt, ctxt);
>> +    unsigned int rc;
>> +    unsigned long addr = poc->cs.base + offset;
>> +
>> +    ASSERT(seg == x86_seg_cs);
>> +
>> +    /* We don't mean to emulate any branches. */
>> +    if ( !bytes )
>> +        return X86EMUL_UNHANDLEABLE;
>> +
>> +    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
>> +                                x86_seg_cs, ctxt, &addr);
>> +    if ( rc != X86EMUL_OKAY )
>> +        return rc;
>> +
>> +    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
>> +    {
>> +        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
>> +                           addr + bytes - rc, ctxt);
> 
> Please can we retain the 0 and comment here.

Oh, sure - I should have extended the similar change from patch 1 to
here.
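
That is, something along these lines (comment wording and the exact
return illustrative):

    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
    {
        /*
         * Retain the zero error code which the pre-emulator code used
         * for failed insn fetches, instead of PFEC_insn_fetch.
         */
        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
        return X86EMUL_EXCEPTION;
    }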

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -1185,7 +1185,7 @@ static int ioport_access_check(
>>  
>>      fail_if(ops->read_segment == NULL);
>>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
>> -        return rc;
>> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
> 
> Please have at least a comment here
> 
> /* Used by the PV path to defer the port permission check to the ioport
> hooks. */
> 
>>  
>>      /* Ensure the TSS has an io-bitmap-offset field. */
>>      generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
>>
> 
> Other than that, subject to double checking the IOPL behaviour,
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, Jan


* Re: [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-13 16:53   ` Andrew Cooper
  2016-12-13 17:16     ` Jan Beulich
@ 2016-12-14  7:35     ` Jan Beulich
  2016-12-14  9:26       ` Andrew Cooper
  1 sibling, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-14  7:35 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 13.12.16 at 17:53, <andrew.cooper3@citrix.com> wrote:
> On 13/12/16 11:28, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -1185,7 +1185,7 @@ static int ioport_access_check(
>>  
>>      fail_if(ops->read_segment == NULL);
>>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
>> -        return rc;
>> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
> 
> Please have at least a comment here
> 
> /* Used by the PV path to defer the port permission check to the ioport
> hooks. */

I'll probably use a slight variation thereof, as I'd prefer not to
mention specific uses (they too easily go stale).

>>      /* Ensure the TSS has an io-bitmap-offset field. */
>>      generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
>>
> 
> Other than that, subject to double checking the IOPL behaviour,
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

These tests are fine. Of the pv-cpuid-faulting ones, which you had
asked for during v3 review, the pv32pae one gets skipped (no
faulting available), and the pv64 one even has a problem getting at
the guest console. I am getting that same problem randomly for
other tests too, though, if I run multiple ones in one batch. No idea
what to do about that. Running that one test in isolation doesn't
eliminate the problem either; the hypervisor log tells me that the
test has been skipped just like the 32-bit one (as one would expect).

Jan


* Re: [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-14  7:35     ` Jan Beulich
@ 2016-12-14  9:26       ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2016-12-14  9:26 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 14/12/2016 07:35, Jan Beulich wrote:
>>>> On 13.12.16 at 17:53, <andrew.cooper3@citrix.com> wrote:
>> On 13/12/16 11:28, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -1185,7 +1185,7 @@ static int ioport_access_check(
>>>  
>>>      fail_if(ops->read_segment == NULL);
>>>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
>>> -        return rc;
>>> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
>> Please have at least a comment here
>>
>> /* Used by the PV path to defer the port permission check to the ioport
>> hooks. */
> I'll probably use a slight variation thereof, as I'd prefer not to
> mention specific uses (they too easily go stale).
>
>>>      /* Ensure the TSS has an io-bitmap-offset field. */
>>>      generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
>>>
>> Other than that, subject to double checking the IOPL behaviour,
>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> These tests are fine.

I have to admit that I am now curious as to how.  By pretending that the
iopl of the guest kernel is 0 rather than 1, I'd expect the case of
vIOPL of 0 and real CPL 1 to break. The in-guest behaviour of this
scenario was to raise #GP faults in the guest kernel.
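
Spelled out as a hypothetical pseudo-check (names invented for
illustration; this is not actual Xen code):

    /*
     * A PV guest kernel runs at real CPL 1 (32-bit guests; 3 for
     * 64-bit ones), while Xen tracks a virtual IOPL for it separately.
     */
    if ( vcpu_viopl(v) < vcpu_real_cpl(v) )   /* e.g. vIOPL 0, CPL 1 */
        /* Fall back to the TSS io-bitmap; if the port isn't granted
         * there, the guest kernel should see #GP. */
        return check_io_bitmap_or_inject_gp(v, port, bytes);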

>  Of the pv-cpuid-faulting ones, which you had
> asked for during v3 review, the pv32pae one gets skipped (no
> faulting available), and the pv64 one even has a problem getting at
> the guest console. I am getting that same problem randomly for
> other tests too, though, if I run multiple ones in one batch. No idea
> what to do about that. Running that one test in isolation doesn't
> eliminate the problem either; the hypervisor log tells me that the
> test has been skipped just like the 32-bit one (as one would expect).

That is a race condition with `xl console`, which isn't sufficiently
synchronised with the previous `xl create` when setting up console
information in xenstore.

There is an alternative --results-mode=logfile, and an optional
--logfile-dir=/path if you have xenconsoled configured in a
non-standard way.

http://xenbits.xen.org/gitweb/?p=xtf.git;a=commitdiff;h=97541e1fde20b6823fa146357a57ec7c6f802c05

However, this isn't the default, because the large amount of
serialisation it introduces makes back-to-back running of tests an
order of magnitude slower. It is the mode used by OSSTest, though.

I am still trying to find a better solution to this problem.

~Andrew

Thread overview: 14 messages
2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
2016-12-13 15:50   ` Andrew Cooper
2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
2016-12-13 15:15   ` Paul Durrant
2016-12-13 11:28 ` [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
2016-12-13 15:52   ` Andrew Cooper
2016-12-13 11:28 ` [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
2016-12-13 16:53   ` Andrew Cooper
2016-12-13 17:16     ` Jan Beulich
2016-12-14  7:35     ` Jan Beulich
2016-12-14  9:26       ` Andrew Cooper
2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
2016-12-13 15:54   ` Andrew Cooper
