* [PATCH v3 0/5] x86: extend use of generic insn emulator for PV
@ 2016-12-06 11:07 Jan Beulich
  2016-12-06 11:12 ` [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

1: x86/32on64: use generic instruction decoding for call gate emulation
2: x86emul: don't assume a memory operand
3: x86emul: generalize exception handling for rep_* hooks
4: x86emul: make write and cmpxchg hooks optional
5: x86/PV: use generic emulator for privileged instruction handling

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Apart from the last patch, which changed heavily (see there), this
    version is largely a re-base plus splitting patch 3 out of patch 5.


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


* [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
@ 2016-12-06 11:12 ` Jan Beulich
  2016-12-06 11:56   ` Andrew Cooper
  2016-12-06 11:13 ` [PATCH v3 2/5] x86emul: don't assume a memory operand Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


... instead of custom handling. Note that we can't use generic
emulation, as the emulator's far branch support is rather rudimentary
at this point in time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
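
For orientation, a condensed sketch (simplified, not a drop-in
implementation; the function name is made up, the x86_insn_*() helpers
and gate_op_read() are the ones introduced by the hunks below) of the
decode-then-inspect pattern that replaces the hand-rolled prefix/ModRM
parser:

/*
 * Sketch only: decode once via the generic emulator, then query the
 * pre-decoded state instead of parsing prefixes and ModRM by hand.
 */
static void sketch_decode_far_branch(struct gate_op_ctxt *goc)
{
    struct x86_emulate_state *state;
    unsigned int modrm_345;
    enum x86_segment seg;

    state = x86_decode_insn(&goc->ctxt, gate_op_read);
    if ( IS_ERR_OR_NULL(state) )
        return;                        /* decode failed or raised #PF */

    switch ( goc->ctxt.opcode )
    {
    case 0x9a: case 0xea:              /* direct far call/jmp */
        /* Target selector is the instruction's second immediate. */
        (void)x86_insn_immediate(state, 1);
        break;
    case 0xff:                         /* Grp5 */
        if ( x86_insn_modrm(state, NULL, &modrm_345) < 3 &&
             ((modrm_345 & 7) == 3 || (modrm_345 & 7) == 5) )
            /* Indirect far branch: selector follows the offset part. */
            (void)(x86_insn_operand_ea(state, &seg) +
                   (x86_insn_opsize(state) >> 3));
        break;
    }

    /* The emulated call pushes rIP advanced by the decoded length. */
    (void)x86_insn_length(state, &goc->ctxt);
    x86_emulate_free_state(state);
}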

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
 #include <xen/init.h>
 #include <xen/sched.h>
 #include <xen/lib.h>
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/mm.h>
 #include <xen/console.h>
@@ -3313,13 +3314,92 @@ static inline int check_stack_limit(unsi
             (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
 }
 
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( is_pv_32bit_vcpu(current) )
+        addr = (uint32_t)addr;
+
+    if ( !__addr_ok(addr) ||
+         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
+                           ? PFEC_insn_fetch : 0,
+                           addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void emulate_gate_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, opnd_sel;
-    unsigned int op_default, op_bytes, ad_default, ad_bytes;
-    unsigned long off, eip, opnd_off, base, limit;
-    int jump;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
 
     /* Check whether this fault is due to the use of a call gate. */
     if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3341,7 +3421,8 @@ static void emulate_gate_op(struct cpu_u
      * Decode instruction (and perhaps operand) to determine RPL,
      * whether this is a jump or a call, and the call return offset.
      */
-    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
          !(ar & _SEGMENT_S) ||
          !(ar & _SEGMENT_P) ||
          !(ar & _SEGMENT_CODE) )
@@ -3350,179 +3431,59 @@ static void emulate_gate_op(struct cpu_u
         return;
     }
 
-    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
-    ad_default = ad_bytes = op_default;
-    opnd_sel = opnd_off = 0;
-    jump = -1;
-    for ( eip = regs->eip; eip - regs->_eip < 10; )
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
+    {
+        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
+            do_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
     {
-        switch ( insn_fetch(u8, base, eip, limit) )
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+            break;
+        switch ( modrm_345 & 7 )
         {
-        case 0x66: /* operand-size override */
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            opnd_sel = regs->cs;
-            ASSERT(opnd_sel);
-            continue;
-        case 0x3e: /* DS override */
-            opnd_sel = read_sreg(ds);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x26: /* ES override */
-            opnd_sel = read_sreg(es);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x64: /* FS override */
-            opnd_sel = read_sreg(fs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x65: /* GS override */
-            opnd_sel = read_sreg(gs);
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0x36: /* SS override */
-            opnd_sel = regs->ss;
-            if ( !opnd_sel )
-                opnd_sel = dpl;
-            continue;
-        case 0xea:
+            enum x86_segment seg;
+
+        case 5:
             ++jump;
-            /* FALLTHROUGH */
-        case 0x9a:
+            /* fall through */
+        case 3:
             ++jump;
-            opnd_sel = regs->cs;
-            opnd_off = eip;
-            ad_bytes = ad_default;
-            eip += op_bytes + 2;
-            break;
-        case 0xff:
-            {
-                unsigned int modrm;
-
-                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
-                {
-                case 0x28: case 0x68: case 0xa8:
-                    ++jump;
-                    /* FALLTHROUGH */
-                case 0x18: case 0x58: case 0x98:
-                    ++jump;
-                    if ( ad_bytes != 2 )
-                    {
-                        if ( (modrm & 7) == 4 )
-                        {
-                            unsigned int sib;
-                            sib = insn_fetch(u8, base, eip, limit);
-
-                            modrm = (modrm & ~7) | (sib & 7);
-                            if ( ((sib >>= 3) & 7) != 4 )
-                                opnd_off = *(unsigned long *)
-                                    decode_register(sib & 7, regs, 0);
-                            opnd_off <<= sib >> 3;
-                        }
-                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
-                            opnd_off += *(unsigned long *)
-                                decode_register(modrm & 7, regs, 0);
-                        else
-                            modrm |= 0x87;
-                        if ( !opnd_sel )
-                        {
-                            switch ( modrm & 7 )
-                            {
-                            default:
-                                opnd_sel = read_sreg(ds);
-                                break;
-                            case 4: case 5:
-                                opnd_sel = regs->ss;
-                                break;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 1: case 7:
-                            opnd_off = regs->ebx;
-                            break;
-                        case 6:
-                            if ( !(modrm & 0xc0) )
-                                modrm |= 0x80;
-                            else
-                        case 2: case 3:
-                            {
-                                opnd_off = regs->ebp;
-                                if ( !opnd_sel )
-                                    opnd_sel = regs->ss;
-                            }
-                            break;
-                        }
-                        if ( !opnd_sel )
-                            opnd_sel = read_sreg(ds);
-                        switch ( modrm & 7 )
-                        {
-                        case 0: case 2: case 4:
-                            opnd_off += regs->esi;
-                            break;
-                        case 1: case 3: case 5:
-                            opnd_off += regs->edi;
-                            break;
-                        }
-                    }
-                    switch ( modrm & 0xc0 )
-                    {
-                    case 0x40:
-                        opnd_off += insn_fetch(s8, base, eip, limit);
-                        break;
-                    case 0x80:
-                        if ( ad_bytes > 2 )
-                            opnd_off += insn_fetch(s32, base, eip, limit);
-                        else
-                            opnd_off += insn_fetch(s16, base, eip, limit);
-                        break;
-                    }
-                    if ( ad_bytes == 4 )
-                        opnd_off = (unsigned int)opnd_off;
-                    else if ( ad_bytes == 2 )
-                        opnd_off = (unsigned short)opnd_off;
-                    break;
-                }
-            }
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
             break;
         }
         break;
     }
 
-    if ( jump < 0 )
-    {
- fail:
-        do_guest_trap(TRAP_gp_fault, regs);
- skip:
-        return;
-    }
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
 
-    if ( (opnd_sel != regs->cs &&
-          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
-    {
-        do_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
+    if ( rc == X86EMUL_EXCEPTION )
+       return;
 
-    opnd_off += op_bytes;
-#define ad_default ad_bytes
-    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
-    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
     {
         do_guest_trap(TRAP_gp_fault, regs);
         return;
@@ -3663,7 +3624,7 @@ static void emulate_gate_op(struct cpu_u
             }
         }
         push(regs->cs);
-        push(eip);
+        push(regs->eip + insn_len);
 #undef push
         regs->esp = esp;
         regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5560,6 +5560,14 @@ void x86_emulate_free_state(struct x86_e
 }
 #endif
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+    check_state(state);
+
+    return state->op_bytes << 3;
+}
+
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg)
@@ -5577,6 +5585,33 @@ x86_insn_modrm(const struct x86_emulate_
     return state->modrm_mod;
 }
 
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg)
+{
+    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+    check_state(state);
+
+    return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+    check_state(state);
+
+    switch ( nr )
+    {
+    case 0:
+        return state->imm1;
+    case 1:
+        return state->imm2;
+    }
+
+    return 0;
+}
+
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -629,9 +629,17 @@ x86_decode_insn(
         void *p_data, unsigned int bytes,
         struct x86_emulate_ctxt *ctxt));
 
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
 int
 x86_insn_modrm(const struct x86_emulate_state *state,
                unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+                    enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+                   unsigned int nr);
 unsigned int
 x86_insn_length(const struct x86_emulate_state *state,
                 const struct x86_emulate_ctxt *ctxt);




* [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
  2016-12-06 11:12 ` [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-12-06 11:13 ` Jan Beulich
  2016-12-06 16:49   ` Andrew Cooper
  2016-12-06 11:14 ` [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


Especially for x86_insn_operand_ea() to return dependable segment
information even when the caller didn't consider applicability, we
shouldn't have ea.type start out as OP_MEM. Make it OP_NONE instead,
and set it to OP_MEM only when we actually encounter memory-like
operands.

This requires eliminating the XSA-123 fix, which has no longer been
necessary since the elimination of the union in commit dd766684e7. That
in turn allows restricting the scope of override_seg to x86_decode().
While at it, also give it a proper type instead of plain int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
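
As an illustration (a hypothetical consumer, not part of the patch),
the OP_NONE default is what lets users of the decoded state trust the
segment reported by x86_insn_operand_ea():

/*
 * Hypothetical helper: with ea.type defaulting to OP_NONE, a result of
 * x86_seg_none now reliably means "no memory operand", independent of
 * which opcode was decoded.
 */
static bool insn_has_mem_operand(const struct x86_emulate_state *state,
                                 enum x86_segment *seg,
                                 unsigned long *off)
{
    *off = x86_insn_operand_ea(state, seg);

    return *seg != x86_seg_none;  /* *off is meaningful only if true */
}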

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1770,7 +1770,6 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
-    int override_seg;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -1806,7 +1805,6 @@ struct x86_emulate_state {
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
-#define override_seg (state->override_seg)
 #define ea (state->ea)
 
 static int
@@ -1835,6 +1833,7 @@ x86_decode_onebyte(
     case 0xa0: case 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */
     case 0xa2: case 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */
         /* Source EA is not encoded via ModRM. */
+        ea.type = OP_MEM;
         ea.mem.off = insn_fetch_bytes(ad_bytes);
         break;
 
@@ -1925,11 +1924,11 @@ x86_decode(
 {
     uint8_t b, d, sib, sib_index, sib_base;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
+    enum x86_segment override_seg = x86_seg_none;
     int rc = X86EMUL_OKAY;
 
     memset(state, 0, sizeof(*state));
-    override_seg = -1;
-    ea.type = OP_MEM;
+    ea.type = OP_NONE;
     ea.mem.seg = x86_seg_ds;
     ea.reg = PTR_POISON;
     state->regs = ctxt->regs;
@@ -2245,6 +2244,7 @@ x86_decode(
         else if ( ad_bytes == 2 )
         {
             /* 16-bit ModR/M decode. */
+            ea.type = OP_MEM;
             switch ( modrm_rm )
             {
             case 0:
@@ -2295,6 +2295,7 @@ x86_decode(
         else
         {
             /* 32/64-bit ModR/M decode. */
+            ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
                 sib = insn_fetch_type(uint8_t);
@@ -2359,7 +2360,7 @@ x86_decode(
         }
     }
 
-    if ( override_seg != -1 && ea.type == OP_MEM )
+    if ( override_seg != x86_seg_none )
         ea.mem.seg = override_seg;
 
     /* Fetch the immediate operand, if present. */
@@ -4432,13 +4433,11 @@ x86_emulate(
             generate_exception_if(limit < sizeof(long) ||
                                   (limit & (limit - 1)), EXC_UD);
             base &= ~(limit - 1);
-            if ( override_seg == -1 )
-                override_seg = x86_seg_ds;
             if ( ops->rep_stos )
             {
                 unsigned long nr_reps = limit / sizeof(zero);
 
-                rc = ops->rep_stos(&zero, override_seg, base, sizeof(zero),
+                rc = ops->rep_stos(&zero, ea.mem.seg, base, sizeof(zero),
                                    &nr_reps, ctxt);
                 if ( rc == X86EMUL_OKAY )
                 {
@@ -4450,7 +4449,7 @@ x86_emulate(
             }
             while ( limit )
             {
-                rc = ops->write(override_seg, base, &zero, sizeof(zero), ctxt);
+                rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 base += sizeof(zero);
@@ -5477,7 +5476,6 @@ x86_emulate(
 #undef rex_prefix
 #undef lock_prefix
 #undef vex
-#undef override_seg
 #undef ea
 
 static void __init __maybe_unused build_assertions(void)





* [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks
  2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
  2016-12-06 11:12 ` [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
  2016-12-06 11:13 ` [PATCH v3 2/5] x86emul: don't assume a memory operand Jan Beulich
@ 2016-12-06 11:14 ` Jan Beulich
  2016-12-06 17:09   ` Andrew Cooper
  2016-12-06 11:15 ` [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
  2016-12-06 11:16 ` [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
  4 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


If any of those hooks returns X86EMUL_EXCEPTION, some register state
should still get updated if some iterations have been performed (but
the rIP update will get suppressed if not all of them were handled).
This updating is done by register_address_increment() and
__put_rep_prefix() (which hence must no longer be bypassed). As a
result put_rep_prefix() can then skip most of the writeback, but needs
to ensure proper completion of the executed instruction.

While on the HVM side the VA -> LA -> PA translation process ensures
that an exception would be raised on the first iteration only,
guaranteeing the same on the PV side would unduly complicate the code
about to be added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Broken out from a later patch.
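
To illustrate the hook contract this change relies on, here is a
hypothetical PV-style rep_outs() implementation (function name and the
guest_mapping_ok() check are placeholders, not code from this series):

/*
 * Sketch: on a mid-run fault, report the completed count via *reps and
 * return X86EMUL_EXCEPTION.  The emulator then still advances rSI/rCX
 * for the completed iterations but suppresses the rIP update, so the
 * instruction is re-executed for the remainder once the exception has
 * been delivered.
 */
static int sketch_rep_outs(enum x86_segment seg, unsigned long offset,
                           uint16_t port, unsigned int bytes_per_rep,
                           unsigned long *reps,
                           struct x86_emulate_ctxt *ctxt)
{
    unsigned long done;

    for ( done = 0; done < *reps; ++done )
    {
        unsigned long ea = offset + done * bytes_per_rep;

        if ( !guest_mapping_ok(ea) )         /* placeholder check */
        {
            *reps = done;                    /* completed iterations only */
            x86_emul_pagefault(0, ea, ctxt); /* queue #PF for 'ea' */
            return X86EMUL_EXCEPTION;
        }
        /* ... read one element at seg:ea and emit it to 'port' ... */
    }

    return X86EMUL_OKAY;
}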

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -960,7 +960,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                               \
     if ( rep_prefix() )                                                 \
+    {                                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                        \
+            goto no_writeback;                                          \
+    }                                                                   \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -2896,14 +2900,9 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_ins ||
              ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2915,6 +2914,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2925,14 +2926,9 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_outs ||
              ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2946,6 +2942,8 @@ x86_emulate(
             _regs.esi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3161,15 +3159,10 @@ x86_emulate(
         dst.mem.seg = x86_seg_es;
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         src.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
-        if ( (nr_reps > 1) && (ops->rep_movs != NULL) &&
+        if ( (nr_reps == 1) || !ops->rep_movs ||
              ((rc = ops->rep_movs(ea.mem.seg, src.mem.off,
                                   dst.mem.seg, dst.mem.off, dst.bytes,
-                                  &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, src.mem.off,
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -3184,6 +3177,8 @@ x86_emulate(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3222,13 +3217,14 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
-        else if ( rc != X86EMUL_OKAY )
-            goto done;
         register_address_increment(
             _regs.edi,
             nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 




* [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional
  2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
                   ` (2 preceding siblings ...)
  2016-12-06 11:14 ` [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
@ 2016-12-06 11:15 ` Jan Beulich
  2016-12-06 17:34   ` Andrew Cooper
  2016-12-06 11:16 ` [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
  4 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper


While the read and fetch hooks are basically unavoidable, write and
cmpxchg aren't really needed by that many insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
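
A minimal sketch (hook names are placeholders, not from this series) of
what this enables for callers that never need to store anything:

/*
 * Hypothetical ops table: with write and cmpxchg now optional, a caller
 * that only ever reads can omit them.  Any instruction that does need a
 * store path then fails cleanly with X86EMUL_UNHANDLEABLE through the
 * fail_if() checks added below.
 */
static const struct x86_emulate_ops sketch_readonly_ops = {
    .read       = sketch_read,          /* placeholder */
    .insn_fetch = sketch_insn_fetch,    /* placeholder */
    .read_cr    = sketch_read_cr,       /* placeholder */
    /* .write and .cmpxchg deliberately left NULL */
};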

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1492,6 +1492,8 @@ protmode_load_seg(
     {
         uint32_t new_desc_b = desc.b | a_flag;
 
+        if ( !ops->cmpxchg )
+            return X86EMUL_UNHANDLEABLE;
         switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
                                     &new_desc_b, sizeof(desc.b), ctxt)) )
         {
@@ -2624,13 +2626,18 @@ x86_emulate(
         }
         else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
         {
+            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
             if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
                                   &dst.val, dst.bytes, ctxt, ops)) )
                 goto done;
             dst.orig_val = dst.val;
         }
-        else /* Lock prefix is allowed only on RMW instructions. */
+        else
+        {
+            /* Lock prefix is allowed only on RMW instructions. */
             generate_exception_if(lock_prefix, EXC_UD);
+            fail_if(!ops->write);
+        }
         break;
     }
 
@@ -2790,7 +2797,9 @@ x86_emulate(
         unsigned long regs[] = {
             _regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
             _regs.esp, _regs.ebp, _regs.esi, _regs.edi };
+
         generate_exception_if(mode_64bit(), EXC_UD);
+        fail_if(!ops->write);
         for ( i = 0; i < 8; i++ )
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
                                   &regs[i], op_bytes, ctxt)) != 0 )
@@ -3092,7 +3101,7 @@ x86_emulate(
     case 0x9a: /* call (far, absolute) */
         ASSERT(!mode_64bit());
     far_call:
-        fail_if(ops->read_segment == NULL);
+        fail_if(!ops->read_segment || !ops->write);
 
         if ( (rc = ops->read_segment(x86_seg_cs, &sreg, ctxt)) ||
              (rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
@@ -3334,6 +3343,7 @@ x86_emulate(
         uint8_t depth = imm2 & 31;
         int i;
 
+        fail_if(!ops->write);
         dst.type = OP_REG;
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.ebp;
@@ -4443,6 +4453,7 @@ x86_emulate(
                 else if ( rc != X86EMUL_UNHANDLEABLE )
                     goto done;
             }
+            fail_if(limit && !ops->write);
             while ( limit )
             {
                 rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
@@ -4463,7 +4474,7 @@ x86_emulate(
         case 1: /* sidt */
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
             generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
-            fail_if(ops->read_segment == NULL);
+            fail_if(!ops->read_segment || !ops->write);
             if ( (rc = ops->read_segment(seg, &sreg, ctxt)) )
                 goto done;
             if ( mode_64bit() )
@@ -4500,12 +4511,18 @@ x86_emulate(
             break;
         case 4: /* smsw */
             generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
-            ea.bytes = (ea.type == OP_MEM) ? 2 : op_bytes;
+            if ( ea.type == OP_MEM )
+            {
+                fail_if(!ops->write);
+                d |= Mov; /* force writeback */
+                ea.bytes = 2;
+            }
+            else
+                ea.bytes = op_bytes;
             dst = ea;
             fail_if(ops->read_cr == NULL);
             if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
                 goto done;
-            d |= Mov; /* force writeback */
             break;
         case 6: /* lmsw */
             fail_if(ops->read_cr == NULL);
@@ -4707,6 +4724,8 @@ x86_emulate(
             if ( !(b & 1) )
                 rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
                                ea.bytes, ctxt);
+            else
+                fail_if(!ops->write);
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
@@ -4993,6 +5012,8 @@ x86_emulate(
             if ( b == 0x6f )
                 rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
                                ea.bytes, ctxt);
+            else
+                fail_if(!ops->write);
         }
         if ( ea.type == OP_MEM || b == 0x7e )
         {
@@ -5291,6 +5312,7 @@ x86_emulate(
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        fail_if(!ops->cmpxchg);
         if ( op_bytes == 8 )
             host_and_vcpu_must_have(cx16);
         op_bytes *= 2;
@@ -5424,12 +5446,18 @@ x86_emulate(
              !ctxt->force_writeback )
             /* nothing to do */;
         else if ( lock_prefix )
+        {
+            fail_if(!ops->cmpxchg);
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
                 &dst.val, dst.bytes, ctxt);
+        }
         else
+        {
+            fail_if(!ops->write);
             rc = ops->write(
                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+        }
         if ( rc != 0 )
             goto done;
     default:
@@ -5513,8 +5541,6 @@ x86_decode_insn(
     const struct x86_emulate_ops ops = {
         .insn_fetch = insn_fetch,
         .read       = x86emul_unhandleable_rw,
-        .write      = PTR_POISON,
-        .cmpxchg    = PTR_POISON,
     };
     int rc = x86_decode(state, ctxt, &ops);
 



+            else
+                fail_if(!ops->write);
         }
         if ( ea.type == OP_MEM || b == 0x7e )
         {
@@ -5291,6 +5312,7 @@ x86_emulate(
 
         generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        fail_if(!ops->cmpxchg);
         if ( op_bytes == 8 )
             host_and_vcpu_must_have(cx16);
         op_bytes *= 2;
@@ -5424,12 +5446,18 @@ x86_emulate(
              !ctxt->force_writeback )
             /* nothing to do */;
         else if ( lock_prefix )
+        {
+            fail_if(!ops->cmpxchg);
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
                 &dst.val, dst.bytes, ctxt);
+        }
         else
+        {
+            fail_if(!ops->write);
             rc = ops->write(
                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+        }
         if ( rc != 0 )
             goto done;
     default:
@@ -5513,8 +5541,6 @@ x86_decode_insn(
     const struct x86_emulate_ops ops = {
         .insn_fetch = insn_fetch,
         .read       = x86emul_unhandleable_rw,
-        .write      = PTR_POISON,
-        .cmpxchg    = PTR_POISON,
     };
     int rc = x86_decode(state, ctxt, &ops);
 

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
                   ` (3 preceding siblings ...)
  2016-12-06 11:15 ` [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
@ 2016-12-06 11:16 ` Jan Beulich
  2016-12-06 11:21   ` Jan Beulich
  2016-12-06 19:14   ` Andrew Cooper
  4 siblings, 2 replies; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant

[-- Attachment #1: Type: text/plain, Size: 48069 bytes --]

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment).

The other small tweak to the emulator is to the single-iteration handling
of INS and OUTS: since we don't want to handle any other memory access
instructions, we want these to be handled by the rep_ins() / rep_outs()
hooks here too.
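
As a rough sketch (not part of the patch), a read_io() hook wanting to use the new return code might look like the following; handled_directly() and my_port_read() are placeholders for caller-specific logic:

static int my_read_io(unsigned int port, unsigned int bytes,
                      unsigned long *val, struct x86_emulate_ctxt *ctxt)
{
    /*
     * Access already carried out by the caller (e.g. via a stub run with
     * full guest GPR state): return X86EMUL_DONE so x86_emulate() skips
     * its own GPR update while still running the common retire logic.
     */
    if ( handled_directly(port, bytes, ctxt) )
        return X86EMUL_DONE;

    *val = my_port_read(port, bytes);
    return X86EMUL_OKAY;
}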

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base. Do away with the special case pointer checks on the ->read
    and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
    for 32-bit guests (to avoid the emulator's in_longmode() returning
    a wrong result). Introduce and use ->validate() hook. Make
    formatting more consistent.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         x86_emul_pagefault(pfec, addr, &hvmemul_ctxt->ctxt);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 x86_emul_pagefault(pfec, addr & PAGE_MASK, &hvmemul_ctxt->ctxt);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -572,6 +573,7 @@ static int hvmemul_virtual_to_linear(
      * neither know the exact error code to be used, nor can we easily
      * determine the kind of exception (#GP or #TS) in that case.
      */
+    *reps = 0;
     if ( is_x86_user_segment(seg) )
         x86_emul_hw_exception((seg == x86_seg_ss)
                               ? TRAP_stack_error
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
     pv_inject_event(&event);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2023,6 +2020,148 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    bool okay;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        okay = limit >= bytes - 1 && offset <= limit - bytes + 1;
+        *addr = (uint32_t)*addr;
+    }
+    else
+        okay = __addr_ok(*addr);
+
+    if ( unlikely(!okay) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+
+    return okay;
+}
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+                           addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access to I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2269,6 +2408,236 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2409,6 +2778,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2436,6 +2806,28 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2539,7 +2931,6 @@ static int priv_op_read_msr(unsigned int
         if ( rc )
             return X86EMUL_OKAY;
         /* fall through */
-    case MSR_EFER:
     normal:
         /* Everyone can read the MSR space. */
         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3152,41 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
 int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
                   unsigned int *edx, struct x86_emulate_ctxt *ctxt)
 {
     struct cpu_user_regs regs = *ctxt->regs;
 
+    /*
+     * x86_emulate uses this function to query CPU features for its own
+     * internal use. Make sure we're actually emulating CPUID before checking
+     * for emulated CPUID faulting.
+     */
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+    {
+        const struct vcpu *curr = current;
+
+        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+        if ( curr->arch.cpuid_faulting &&
+             !guest_kernel_mode(curr, ctxt->regs) )
+            return X86EMUL_UNHANDLEABLE;
+    }
+
     regs._eax = *eax;
     regs._ecx = *ecx;
 
@@ -2779,497 +3200,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
     return X86EMUL_OKAY;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
-
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+    switch ( ctxt->opcode )
     {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
-
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
-
-        port = (u16)regs->edx;
-
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(PFEC_write_access,
-                                     data_base + rd_ad(edi) + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(0, data_base + rd_ad(esi)
-                                     + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
             break;
-        }
-
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
-
-        goto done;
-    }
-
-    /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
-     */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
-    {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
-    }
+        return X86EMUL_DONE;
 
-    /* No decode of this single-byte opcode. */
-    goto fail;
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
 
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
             break;
-        case 0xd1: /* XSETBV */
+        switch ( modrm_reg & 7 )
         {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+        case 2: /* xgetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
 
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
-            {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
+    return X86EMUL_UNHANDLEABLE;
+}
 
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
 
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->_eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change TF, IF, or IOPL.
+     */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+    regs->_eflags |= X86_EFLAGS_IF;
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+    /* More strict than x86_emulate_wrapper(). */
+    ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+            {
+                uint64_t val = rdtsc();
 
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
-        if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
-            goto fail;
-
-        pv_cpuid(regs);
-        break;
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
 
-    default:
-        goto fail;
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3599,7 +3677,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1182,7 +1182,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
 
     /* Ensure the TSS has an io-bitmap-offset field. */
     generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
@@ -2469,6 +2469,21 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
+    if ( ops->validate )
+    {
+#ifndef NDEBUG
+        state.caller = __builtin_return_address(0);
+#endif
+        rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+        state.caller = NULL;
+#endif
+        if ( rc == X86EMUL_DONE )
+            goto no_writeback;
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
+
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
@@ -2909,13 +2924,28 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_ins ||
-             ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_ins )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->read_io && ops->write )
         {
-            fail_if(ops->read_io == NULL);
+            rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+            rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+                              &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            fail_if(!ops->read_io || !ops->write);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
                 goto done;
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             dst.type = OP_MEM;
             nr_reps = 1;
         }
@@ -2935,14 +2965,30 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_outs ||
-             ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_outs )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->write_io )
         {
-            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
-                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
+            rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+                            ctxt, ops);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+            rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+                               &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
                 goto done;
             fail_if(ops->write_io == NULL);
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
                 goto done;
             nr_reps = 1;
@@ -4042,7 +4088,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5464,9 +5514,7 @@ x86_emulate(
         break;
     }
 
- no_writeback:
-    /* Commit shadow register state. */
-    _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -5476,7 +5524,15 @@ x86_emulate(
     if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
         ctxt->retire.singlestep = true;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip = _regs.eip;
+        rc = X86EMUL_OKAY;
+    }
+
+    ctxt->regs->eflags &= ~EFLG_RF;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -150,6 +150,14 @@ struct __attribute__((__packed__)) segme
 #define X86EMUL_EXCEPTION      2
  /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+ /*
+  * Operation fully done by one of the hooks:
+  * - validate(): operation completed (except common insn retire logic)
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -160,6 +168,8 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_ymm  /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
 };
 
+struct x86_emulate_state;
+
 /*
  * These operations represent the instruction emulator's interface to memory,
  * I/O ports, privileged state... pretty much everything other than GPRs.
@@ -239,6 +249,13 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * validate: Post-decode hook to allow caller controlled filtering.
+     */
+    int (*validate)(
+        const struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
      *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
      *  @reps:  [IN ] Maximum repetitions to be emulated.
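
To illustrate the new hook from a caller's perspective (this is not part of the patch), a validate() implementation that only lets RDTSC and CPUID through might look like:

static int my_validate(const struct x86_emulate_state *state,
                       struct x86_emulate_ctxt *ctxt)
{
    switch ( ctxt->opcode )
    {
    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
        return X86EMUL_OKAY;      /* proceed with normal emulation */
    }

    /* Anything else is rejected before any state gets modified. */
    return X86EMUL_UNHANDLEABLE;
}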



[-- Attachment #2: x86-PV-priv-op-generic-emul.patch --]
[-- Type: text/plain, Size: 48133 bytes --]

x86/PV: use generic emulator for privileged instruction handling

There's a new emulator return code being added to allow bypassing
certain operations (see the code comment).

The other small tweak to the emulator is to the single-iteration handling
of INS and OUTS: since we don't want to handle any other memory access
instructions, we want these to be handled by the rep_ins() / rep_outs()
hooks here too.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base. Do away with the special case pointer checks on the ->read
    and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
    for 32-bit guests (to avoid the emulator's in_longmode() returning
    a wrong result). Introduce and use ->validate() hook. Make
    formatting more consistent.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         x86_emul_pagefault(pfec, addr, &hvmemul_ctxt->ctxt);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 x86_emul_pagefault(pfec, addr & PAGE_MASK, &hvmemul_ctxt->ctxt);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -572,6 +573,7 @@ static int hvmemul_virtual_to_linear(
      * neither know the exact error code to be used, nor can we easily
      * determine the kind of exception (#GP or #TS) in that case.
      */
+    *reps = 0;
     if ( is_x86_user_segment(seg) )
         x86_emul_hw_exception((seg == x86_seg_ss)
                               ? TRAP_stack_error
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
     pv_inject_event(&event);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2023,6 +2020,148 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    bool okay;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        okay = limit >= bytes - 1 && offset <= limit - bytes + 1;
+        *addr = (uint32_t)*addr;
+    }
+    else
+        okay = __addr_ok(*addr);
+
+    if ( unlikely(!okay) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+
+    return okay;
+}
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+                           addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access the I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.l   = 1;
+        reg->attr.fields.db  = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2269,6 +2408,236 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2409,6 +2778,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2436,6 +2806,28 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2539,7 +2931,6 @@ static int priv_op_read_msr(unsigned int
         if ( rc )
             return X86EMUL_OKAY;
         /* fall through */
-    case MSR_EFER:
     normal:
         /* Everyone can read the MSR space. */
         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3152,41 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
 int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
                   unsigned int *edx, struct x86_emulate_ctxt *ctxt)
 {
     struct cpu_user_regs regs = *ctxt->regs;
 
+    /*
+     * x86_emulate uses this function to query CPU features for its own
+     * internal use. Make sure we're actually emulating CPUID before checking
+     * for emulated CPUID faulting.
+     */
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+    {
+        const struct vcpu *curr = current;
+
+        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+        if ( curr->arch.cpuid_faulting &&
+             !guest_kernel_mode(curr, ctxt->regs) )
+            return X86EMUL_UNHANDLEABLE;
+    }
+
     regs._eax = *eax;
     regs._ecx = *ecx;
 
@@ -2779,497 +3200,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
     return X86EMUL_OKAY;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
-
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+    switch ( ctxt->opcode )
     {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
-
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1;  /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm  = (rex & 1) << 3;  /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-    
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
-
-        port = (u16)regs->edx;
-
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(PFEC_write_access,
-                                     data_base + rd_ad(edi) + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                  !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                pv_inject_page_fault(0, data_base + rd_ad(esi)
-                                     + op_bytes - rc);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
             break;
-        }
-
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
-
-        goto done;
-    }
-
-    /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
-     */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
-    {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);            
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
-    }
+        return X86EMUL_DONE;
 
-    /* No decode of this single-byte opcode. */
-    goto fail;
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
 
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode 
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) && 
-        (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
             break;
-        case 0xd1: /* XSETBV */
+        switch ( modrm_reg & 7 )
         {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+        case 2: /* xgetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
 
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
-            {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
+    return X86EMUL_UNHANDLEABLE;
+}
 
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
 
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->_eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change TF, IF, or IOPL.
+     */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+    regs->_eflags |= X86_EFLAGS_IF;
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+    /* More strict than x86_emulate_wrapper(). */
+    ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+            {
+                uint64_t val = rdtsc();
 
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
         }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
-        if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
-            goto fail;
-
-        pv_cpuid(regs);
-        break;
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
 
-    default:
-        goto fail;
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
     }
 
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3599,7 +3677,7 @@ static void emulate_gate_op(struct cpu_u
         sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1182,7 +1182,7 @@ static int ioport_access_check(
 
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
 
     /* Ensure the TSS has an io-bitmap-offset field. */
     generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
@@ -2469,6 +2469,21 @@ x86_emulate(
     /* Sync rIP to post decode value. */
     _regs.eip = state.eip;
 
+    if ( ops->validate )
+    {
+#ifndef NDEBUG
+        state.caller = __builtin_return_address(0);
+#endif
+        rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+        state.caller = NULL;
+#endif
+        if ( rc == X86EMUL_DONE )
+            goto no_writeback;
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
+
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
@@ -2909,13 +2924,28 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_ins ||
-             ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_ins )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->read_io && ops->write )
         {
-            fail_if(ops->read_io == NULL);
+            rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+            rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+                              &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            fail_if(!ops->read_io || !ops->write);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
                 goto done;
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             dst.type = OP_MEM;
             nr_reps = 1;
         }
@@ -2935,14 +2965,30 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps == 1) || !ops->rep_outs ||
-             ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                  &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+        /* Try the presumably most efficient approach first. */
+        if ( !ops->rep_outs )
+            nr_reps = 1;
+        rc = X86EMUL_UNHANDLEABLE;
+        if ( nr_reps == 1 && ops->write_io )
         {
-            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
-                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
+            rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+                            ctxt, ops);
+            if ( rc == X86EMUL_OKAY )
+                nr_reps = 0;
+        }
+        if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+            rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+                               &nr_reps, ctxt);
+        if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+        {
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
                 goto done;
             fail_if(ops->write_io == NULL);
+            nr_reps = 0;
+        }
+        if ( !nr_reps && rc == X86EMUL_OKAY )
+        {
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
                 goto done;
             nr_reps = 1;
@@ -4042,7 +4088,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5464,9 +5514,7 @@ x86_emulate(
         break;
     }
 
- no_writeback:
-    /* Commit shadow register state. */
-    _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -5476,7 +5524,15 @@ x86_emulate(
     if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
         ctxt->retire.singlestep = true;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip = _regs.eip;
+        rc = X86EMUL_OKAY;
+    }
+
+    ctxt->regs->eflags &= ~EFLG_RF;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -150,6 +150,14 @@ struct __attribute__((__packed__)) segme
 #define X86EMUL_EXCEPTION      2
  /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+ /*
+  * Operation fully done by one of the hooks:
+  * - validate(): operation completed (except common insn retire logic)
+  * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+  * - read_io() / write_io(): bypass GPR update (non-string insns only)
+  * Undefined behavior when used anywhere else.
+  */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -160,6 +168,8 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_ymm  /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
 };
 
+struct x86_emulate_state;
+
 /*
  * These operations represent the instruction emulator's interface to memory,
  * I/O ports, privileged state... pretty much everything other than GPRs.
@@ -239,6 +249,13 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * validate: Post-decode hook to allow caller controlled filtering.
+     */
+    int (*validate)(
+        const struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
      *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
      *  @reps:  [IN ] Maximum repetitions to be emulated.


* Re: [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-06 11:16 ` [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
@ 2016-12-06 11:21   ` Jan Beulich
  2016-12-06 19:14   ` Andrew Cooper
  1 sibling, 0 replies; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 11:21 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant

>>> On 06.12.16 at 12:16, <JBeulich@suse.com> wrote:
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
>              return X86EMUL_RETRY;
> +        *reps = 0;
>          x86_emul_pagefault(pfec, addr, &hvmemul_ctxt->ctxt);
>          return X86EMUL_EXCEPTION;
>      }
> @@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
>              if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
>                  return X86EMUL_RETRY;
>              done /= bytes_per_rep;
> +            *reps = done;
>              if ( done == 0 )
>              {
>                  ASSERT(!reverse);
> @@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
>                  x86_emul_pagefault(pfec, addr & PAGE_MASK, &hvmemul_ctxt->ctxt);
>                  return X86EMUL_EXCEPTION;
>              }
> -            *reps = done;
>              break;
>          }
>  
> @@ -572,6 +573,7 @@ static int hvmemul_virtual_to_linear(
>       * neither know the exact error code to be used, nor can we easily
>       * determine the kind of exception (#GP or #TS) in that case.
>       */
> +    *reps = 0;
>      if ( is_x86_user_segment(seg) )
>          x86_emul_hw_exception((seg == x86_seg_ss)
>                                ? TRAP_stack_error

I've only now noticed that these changes all belong in patch 3,
where I've moved them now.

Jan



* Re: [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-06 11:12 ` [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
@ 2016-12-06 11:56   ` Andrew Cooper
  2016-12-06 13:17     ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-06 11:56 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 06/12/16 11:12, Jan Beulich wrote:
> ... instead of custom handling. Note that we can't use generic
> emulation, as the emulator's far branch support is rather rudimentary
> at this point in time.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -28,6 +28,7 @@
>  #include <xen/init.h>
>  #include <xen/sched.h>
>  #include <xen/lib.h>
> +#include <xen/err.h>
>  #include <xen/errno.h>
>  #include <xen/mm.h>
>  #include <xen/console.h>
> @@ -3313,13 +3314,92 @@ static inline int check_stack_limit(unsi
>              (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
>  }
>  
> +struct gate_op_ctxt {
> +    struct x86_emulate_ctxt ctxt;
> +    struct {
> +        unsigned long base, limit;
> +    } cs;
> +    bool insn_fetch;
> +};
> +
> +static int gate_op_read(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    void *p_data,
> +    unsigned int bytes,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    const struct gate_op_ctxt *goc =
> +        container_of(ctxt, struct gate_op_ctxt, ctxt);
> +    unsigned int rc = bytes, sel = 0;
> +    unsigned long addr = offset, limit = 0;
> +
> +    switch ( seg )
> +    {
> +    case x86_seg_cs:
> +        addr += goc->cs.base;
> +        limit = goc->cs.limit;
> +        break;
> +    case x86_seg_ds:
> +        sel = read_sreg(ds);
> +        break;
> +    case x86_seg_es:
> +        sel = read_sreg(es);
> +        break;
> +    case x86_seg_fs:
> +        sel = read_sreg(fs);
> +        break;
> +    case x86_seg_gs:
> +        sel = read_sreg(gs);
> +        break;
> +    case x86_seg_ss:
> +        sel = ctxt->regs->ss;
> +        break;
> +    default:
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +    if ( sel )
> +    {
> +        unsigned int ar;
> +
> +        ASSERT(!goc->insn_fetch);
> +        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
> +             !(ar & _SEGMENT_S) ||
> +             !(ar & _SEGMENT_P) ||
> +             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
> +            return X86EMUL_UNHANDLEABLE;
> +        addr += offset;
> +    }
> +    else if ( seg != x86_seg_cs )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    if ( limit < bytes - 1 || offset > limit - bytes + 1 )

Is this correct for the zero-length instruction fetches which the
emulator emits?  It would be better to make it safe now than to
introduce a latent bug in the future.
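
To make the concern concrete, a standalone sketch with made-up values
(not code from the patch): once bytes == 0 the unsigned subtraction
wraps, so the first clause fires and the zero-length fetch gets
rejected even though it touches no bytes at all.

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long limit = 0xffff;  /* e.g. a 16-bit segment limit */
        unsigned long offset = 0x10;
        unsigned int bytes = 0;        /* zero-length fetch */

        /* bytes - 1 wraps to UINT_MAX, so the range check rejects an
         * access which reads nothing at all. */
        bool rejected = limit < bytes - 1 || offset > limit - bytes + 1;

        printf("rejected = %d\n", rejected);
        return 0;
    }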

> +        return X86EMUL_UNHANDLEABLE;
> +
> +    if ( is_pv_32bit_vcpu(current) )
> +        addr = (uint32_t)addr;

This should be based on %cs attributes.  What about a 64-bit PV guest
in a compatibility segment?
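
For illustration only, something along these lines is what I mean
(assuming the CS access rights read in emulate_gate_op() were latched
into gate_op_ctxt, say as cs.ar - the patch doesn't currently do that):

    /* Truncate based on the code segment, not on the guest type. */
    if ( !(goc->cs.ar & _SEGMENT_L) )   /* cs.ar is hypothetical here */
        addr = (uint32_t)addr;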

> +
> +    if ( !__addr_ok(addr) ||
> +         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
> +    {
> +        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
> +                           ? PFEC_insn_fetch : 0,
> +                           addr + bytes - rc, ctxt);

Independently of the point below, the correct predicate for setting
PFEC_insn_fetch is ((EFER & EFER_NX) || (CR4 & SMEP)).
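
As a sketch only (reusing Xen's existing read_efer()/read_cr4()
accessors; the helper name is made up):

    /* Only report an instruction fetch when the hardware would, i.e.
     * when NX is enabled in EFER or SMEP is enabled in CR4. */
    static uint32_t insn_fetch_pfec(void)
    {
        return ((read_efer() & EFER_NX) || (read_cr4() & X86_CR4_SMEP))
               ? PFEC_insn_fetch : 0;
    }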

However, this is subtly wrong, but I can't think of a sensible way of
fixing it (i.e. without doing independent pagewalks).

__copy_from_user() does a data fetch, not an instruction fetch.

With the advent of PKU, processors support pages which can't be read
but can be executed.  Then again, our chances of ever supporting PKU for
PV guests are slim, so maybe this point isn't very important.

As we actually do a data read, the error code should remain 0.  This
(getting a data fetch failure for something which should have been an
instruction fetch) will be less confusing for the guest than claiming an
instruction fetch failure when we actually did a data fetch, as at least
the error code will be correct for the access rights in the translation.

> +        return X86EMUL_EXCEPTION;
> +    }
> +
> +    return X86EMUL_OKAY;
> +}
> +
>  static void emulate_gate_op(struct cpu_user_regs *regs)
>  {
>      struct vcpu *v = current;
> -    unsigned int sel, ar, dpl, nparm, opnd_sel;
> -    unsigned int op_default, op_bytes, ad_default, ad_bytes;
> -    unsigned long off, eip, opnd_off, base, limit;
> -    int jump;
> +    unsigned int sel, ar, dpl, nparm, insn_len;
> +    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
> +    struct x86_emulate_state *state;
> +    unsigned long off, base, limit;
> +    uint16_t opnd_sel = 0;
> +    int jump = -1, rc = X86EMUL_OKAY;
>  
>      /* Check whether this fault is due to the use of a call gate. */
>      if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
> @@ -3341,7 +3421,8 @@ static void emulate_gate_op(struct cpu_u
>       * Decode instruction (and perhaps operand) to determine RPL,
>       * whether this is a jump or a call, and the call return offset.
>       */
> -    if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
> +    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
> +                          &ar, 0) ||
>           !(ar & _SEGMENT_S) ||
>           !(ar & _SEGMENT_P) ||
>           !(ar & _SEGMENT_CODE) )
> @@ -3350,179 +3431,59 @@ static void emulate_gate_op(struct cpu_u
>          return;
>      }
>  
> -    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
> -    ad_default = ad_bytes = op_default;
> -    opnd_sel = opnd_off = 0;
> -    jump = -1;
> -    for ( eip = regs->eip; eip - regs->_eip < 10; )
> +    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
> +    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
> +    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
> +    ctxt.insn_fetch = false;
> +    if ( IS_ERR_OR_NULL(state) )
> +    {
> +        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
> +            do_guest_trap(TRAP_gp_fault, regs);

Here, and...

> +        return;
> +    }
> +
> +    switch ( ctxt.ctxt.opcode )
>      {
> -        switch ( insn_fetch(u8, base, eip, limit) )
> +        unsigned int modrm_345;
> +
> +    case 0xea:
> +        ++jump;
> +        /* fall through */
> +    case 0x9a:
> +        ++jump;
> +        opnd_sel = x86_insn_immediate(state, 1);
> +        break;
> +    case 0xff:
> +        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
> +            break;
> +        switch ( modrm_345 & 7 )
>          {
> -        case 0x66: /* operand-size override */
> -            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
> -            continue;
> -        case 0x67: /* address-size override */
> -            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
> -            continue;
> -        case 0x2e: /* CS override */
> -            opnd_sel = regs->cs;
> -            ASSERT(opnd_sel);
> -            continue;
> -        case 0x3e: /* DS override */
> -            opnd_sel = read_sreg(ds);
> -            if ( !opnd_sel )
> -                opnd_sel = dpl;
> -            continue;
> -        case 0x26: /* ES override */
> -            opnd_sel = read_sreg(es);
> -            if ( !opnd_sel )
> -                opnd_sel = dpl;
> -            continue;
> -        case 0x64: /* FS override */
> -            opnd_sel = read_sreg(fs);
> -            if ( !opnd_sel )
> -                opnd_sel = dpl;
> -            continue;
> -        case 0x65: /* GS override */
> -            opnd_sel = read_sreg(gs);
> -            if ( !opnd_sel )
> -                opnd_sel = dpl;
> -            continue;
> -        case 0x36: /* SS override */
> -            opnd_sel = regs->ss;
> -            if ( !opnd_sel )
> -                opnd_sel = dpl;
> -            continue;
> -        case 0xea:
> +            enum x86_segment seg;
> +
> +        case 5:
>              ++jump;
> -            /* FALLTHROUGH */
> -        case 0x9a:
> +            /* fall through */
> +        case 3:
>              ++jump;
> -            opnd_sel = regs->cs;
> -            opnd_off = eip;
> -            ad_bytes = ad_default;
> -            eip += op_bytes + 2;
> -            break;
> -        case 0xff:
> -            {
> -                unsigned int modrm;
> -
> -                switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
> -                {
> -                case 0x28: case 0x68: case 0xa8:
> -                    ++jump;
> -                    /* FALLTHROUGH */
> -                case 0x18: case 0x58: case 0x98:
> -                    ++jump;
> -                    if ( ad_bytes != 2 )
> -                    {
> -                        if ( (modrm & 7) == 4 )
> -                        {
> -                            unsigned int sib;
> -                            sib = insn_fetch(u8, base, eip, limit);
> -
> -                            modrm = (modrm & ~7) | (sib & 7);
> -                            if ( ((sib >>= 3) & 7) != 4 )
> -                                opnd_off = *(unsigned long *)
> -                                    decode_register(sib & 7, regs, 0);
> -                            opnd_off <<= sib >> 3;
> -                        }
> -                        if ( (modrm & 7) != 5 || (modrm & 0xc0) )
> -                            opnd_off += *(unsigned long *)
> -                                decode_register(modrm & 7, regs, 0);
> -                        else
> -                            modrm |= 0x87;
> -                        if ( !opnd_sel )
> -                        {
> -                            switch ( modrm & 7 )
> -                            {
> -                            default:
> -                                opnd_sel = read_sreg(ds);
> -                                break;
> -                            case 4: case 5:
> -                                opnd_sel = regs->ss;
> -                                break;
> -                            }
> -                        }
> -                    }
> -                    else
> -                    {
> -                        switch ( modrm & 7 )
> -                        {
> -                        case 0: case 1: case 7:
> -                            opnd_off = regs->ebx;
> -                            break;
> -                        case 6:
> -                            if ( !(modrm & 0xc0) )
> -                                modrm |= 0x80;
> -                            else
> -                        case 2: case 3:
> -                            {
> -                                opnd_off = regs->ebp;
> -                                if ( !opnd_sel )
> -                                    opnd_sel = regs->ss;
> -                            }
> -                            break;
> -                        }
> -                        if ( !opnd_sel )
> -                            opnd_sel = read_sreg(ds);
> -                        switch ( modrm & 7 )
> -                        {
> -                        case 0: case 2: case 4:
> -                            opnd_off += regs->esi;
> -                            break;
> -                        case 1: case 3: case 5:
> -                            opnd_off += regs->edi;
> -                            break;
> -                        }
> -                    }
> -                    switch ( modrm & 0xc0 )
> -                    {
> -                    case 0x40:
> -                        opnd_off += insn_fetch(s8, base, eip, limit);
> -                        break;
> -                    case 0x80:
> -                        if ( ad_bytes > 2 )
> -                            opnd_off += insn_fetch(s32, base, eip, limit);
> -                        else
> -                            opnd_off += insn_fetch(s16, base, eip, limit);
> -                        break;
> -                    }
> -                    if ( ad_bytes == 4 )
> -                        opnd_off = (unsigned int)opnd_off;
> -                    else if ( ad_bytes == 2 )
> -                        opnd_off = (unsigned short)opnd_off;
> -                    break;
> -                }
> -            }
> +            base = x86_insn_operand_ea(state, &seg);
> +            rc = gate_op_read(seg,
> +                              base + (x86_insn_opsize(state) >> 3),
> +                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
>              break;
>          }
>          break;
>      }
>  
> -    if ( jump < 0 )
> -    {
> - fail:
> -        do_guest_trap(TRAP_gp_fault, regs);
> - skip:
> -        return;
> -    }
> +    insn_len = x86_insn_length(state, &ctxt.ctxt);
> +    x86_emulate_free_state(state);
>  
> -    if ( (opnd_sel != regs->cs &&
> -          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
> -         !(ar & _SEGMENT_S) ||
> -         !(ar & _SEGMENT_P) ||
> -         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
> -    {
> -        do_guest_trap(TRAP_gp_fault, regs);
> -        return;
> -    }
> +    if ( rc == X86EMUL_EXCEPTION )

.. and here must unconditionally inject a fault, or the guest will
livelock by repeatedly taking #GP faults.

~Andrew

> +       return;
>  
> -    opnd_off += op_bytes;
> -#define ad_default ad_bytes
> -    opnd_sel = insn_fetch(u16, base, opnd_off, limit);
> -#undef ad_default
> -    if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
> +    if ( rc != X86EMUL_OKAY ||
> +         jump < 0 ||
> +         (opnd_sel & ~3) != regs->error_code ||
> +         dpl < (opnd_sel & 3) )
>      {
>          do_guest_trap(TRAP_gp_fault, regs);
>          return;
> @@ -3663,7 +3624,7 @@ static void emulate_gate_op(struct cpu_u
>              }
>          }
>          push(regs->cs);
> -        push(eip);
> +        push(regs->eip + insn_len);
>  #undef push
>          regs->esp = esp;
>          regs->ss = ss;
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -5560,6 +5560,14 @@ void x86_emulate_free_state(struct x86_e
>  }
>  #endif
>  
> +unsigned int
> +x86_insn_opsize(const struct x86_emulate_state *state)
> +{
> +    check_state(state);
> +
> +    return state->op_bytes << 3;
> +}
> +
>  int
>  x86_insn_modrm(const struct x86_emulate_state *state,
>                 unsigned int *rm, unsigned int *reg)
> @@ -5577,6 +5585,33 @@ x86_insn_modrm(const struct x86_emulate_
>      return state->modrm_mod;
>  }
>  
> +unsigned long
> +x86_insn_operand_ea(const struct x86_emulate_state *state,
> +                    enum x86_segment *seg)
> +{
> +    *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
> +
> +    check_state(state);
> +
> +    return state->ea.mem.off;
> +}
> +
> +unsigned long
> +x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
> +{
> +    check_state(state);
> +
> +    switch ( nr )
> +    {
> +    case 0:
> +        return state->imm1;
> +    case 1:
> +        return state->imm2;
> +    }
> +
> +    return 0;
> +}
> +
>  unsigned int
>  x86_insn_length(const struct x86_emulate_state *state,
>                  const struct x86_emulate_ctxt *ctxt)
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -629,9 +629,17 @@ x86_decode_insn(
>          void *p_data, unsigned int bytes,
>          struct x86_emulate_ctxt *ctxt));
>  
> +unsigned int
> +x86_insn_opsize(const struct x86_emulate_state *state);
>  int
>  x86_insn_modrm(const struct x86_emulate_state *state,
>                 unsigned int *rm, unsigned int *reg);
> +unsigned long
> +x86_insn_operand_ea(const struct x86_emulate_state *state,
> +                    enum x86_segment *seg);
> +unsigned long
> +x86_insn_immediate(const struct x86_emulate_state *state,
> +                   unsigned int nr);
>  unsigned int
>  x86_insn_length(const struct x86_emulate_state *state,
>                  const struct x86_emulate_ctxt *ctxt);
>
>



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-06 11:56   ` Andrew Cooper
@ 2016-12-06 13:17     ` Jan Beulich
  2016-12-07 10:55       ` Andrew Cooper
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-06 13:17 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 06.12.16 at 12:56, <andrew.cooper3@citrix.com> wrote:
> On 06/12/16 11:12, Jan Beulich wrote:
>> +static int gate_op_read(
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>> +    void *p_data,
>> +    unsigned int bytes,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    const struct gate_op_ctxt *goc =
>> +        container_of(ctxt, struct gate_op_ctxt, ctxt);
>> +    unsigned int rc = bytes, sel = 0;
>> +    unsigned long addr = offset, limit = 0;
>> +
>> +    switch ( seg )
>> +    {
>> +    case x86_seg_cs:
>> +        addr += goc->cs.base;
>> +        limit = goc->cs.limit;
>> +        break;
>> +    case x86_seg_ds:
>> +        sel = read_sreg(ds);
>> +        break;
>> +    case x86_seg_es:
>> +        sel = read_sreg(es);
>> +        break;
>> +    case x86_seg_fs:
>> +        sel = read_sreg(fs);
>> +        break;
>> +    case x86_seg_gs:
>> +        sel = read_sreg(gs);
>> +        break;
>> +    case x86_seg_ss:
>> +        sel = ctxt->regs->ss;
>> +        break;
>> +    default:
>> +        return X86EMUL_UNHANDLEABLE;
>> +    }
>> +    if ( sel )
>> +    {
>> +        unsigned int ar;
>> +
>> +        ASSERT(!goc->insn_fetch);
>> +        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
>> +             !(ar & _SEGMENT_S) ||
>> +             !(ar & _SEGMENT_P) ||
>> +             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
>> +            return X86EMUL_UNHANDLEABLE;
>> +        addr += offset;
>> +    }
>> +    else if ( seg != x86_seg_cs )
>> +        return X86EMUL_UNHANDLEABLE;
>> +
>> +    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
> 
> Is this correct for the zero-length instruction fetches which the
> emulator emits?  It would be better to make it safe now, than to
> introduce a latent bug in the future.

The left side of the || unconditionally fails such fetches, and that's
precisely what we want here (as tight a condition as possible)
considering that this gets used only for decoding, not for executing
instructions.
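
To illustrate with a throw-away, standalone example (made-up values, not
hypervisor code) - the unsigned wrap-around is what refuses a zero-length
fetch for any limit below 4G-1:

#include <stdio.h>

int main(void)
{
    unsigned int bytes = 0;        /* zero-length fetch issued by the emulator */
    unsigned long limit = 0xffff;  /* example segment limit */
    unsigned long offset = 0x10;   /* example fetch offset */

    /* bytes - 1 wraps to UINT_MAX, so the left-hand comparison already
     * refuses the access before the right-hand side gets evaluated. */
    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
        printf("rejected\n");
    else
        printf("accepted\n");

    return 0;
}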

>> +        return X86EMUL_UNHANDLEABLE;
>> +
>> +    if ( is_pv_32bit_vcpu(current) )
>> +        addr = (uint32_t)addr;
> 
> This should be based on %cs attributes.  What about a 64bit PV guest in
> a compatability segment?

No - I now realize the conditional is pointless. We only do gate op
emulation for 32-bit guests.

>> +
>> +    if ( !__addr_ok(addr) ||
>> +         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
>> +    {
>> +        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
>> +                           ? PFEC_insn_fetch : 0,
>> +                           addr + bytes - rc, ctxt);
> 
> Independently to the point below, the correct predicate for setting
> PFEC_insn_fetch is ((EFER & EFER_NX) || (CR4 & SMEP))

Remember that here we're dealing with PV guests only - those don't
see any SMEP, and always run with NX enabled as long as we do
ourselves.

> However, this is subtly wrong, but I can't think of a sensible way of
> fixing it (i.e. without doing independent pagewalks).
> 
> __copy_from_user() does a data fetch, not an instruction fetch.
> 
> With the advent of PKU, processors support pages which can't be read,
> but can be executed.  Then again, our chances of ever supporting PKU for
> PV guests is slim, so maybe this point isn't very important.
> 
> As we actually do a data read, the error code should remain 0.  This
> (getting a data fetch failure for something which should have been an
> instruction fetch) will be less confusing for the guest than claiming an
> instruction fetch failure when we actually did a data fetch, as at least
> the error code will be correct for the access rights in the translation.

With the above I think this should remain as is.

>> @@ -3350,179 +3431,59 @@ static void emulate_gate_op(struct cpu_u
>>          return;
>>      }
>>  
>> -    op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
>> -    ad_default = ad_bytes = op_default;
>> -    opnd_sel = opnd_off = 0;
>> -    jump = -1;
>> -    for ( eip = regs->eip; eip - regs->_eip < 10; )
>> +    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
>> +    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
>> +    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
>> +    ctxt.insn_fetch = false;
>> +    if ( IS_ERR_OR_NULL(state) )
>> +    {
>> +        if ( PTR_ERR(state) != -X86EMUL_EXCEPTION )
>> +            do_guest_trap(TRAP_gp_fault, regs);
> 
> Here, and...
> 
>> -    if ( (opnd_sel != regs->cs &&
>> -          !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
>> -         !(ar & _SEGMENT_S) ||
>> -         !(ar & _SEGMENT_P) ||
>> -         ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
>> -    {
>> -        do_guest_trap(TRAP_gp_fault, regs);
>> -        return;
>> -    }
>> +    if ( rc == X86EMUL_EXCEPTION )
> 
> .. and here must unconditionally inject a fault, or the guest will
> livelock by repeatedly taking #GP faults.

The first one needs an else (to deliver the exception indicated by
X86EMUL_EXCEPTION) - this was an oversight during rebase over
your recent changes. And the second one is similar, just that it's
not a missing else (i.e. it's not entirely unconditional to deliver an
exception here; other failure cases are being handled right after
this).

Jan


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-06 11:13 ` [PATCH v3 2/5] x86emul: don't assume a memory operand Jan Beulich
@ 2016-12-06 16:49   ` Andrew Cooper
  2016-12-07  8:22     ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-06 16:49 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 06/12/16 11:13, Jan Beulich wrote:
> @@ -2359,7 +2360,7 @@ x86_decode(
>          }
>      }
>  
> -    if ( override_seg != -1 && ea.type == OP_MEM )
> +    if ( override_seg != x86_seg_none )
>          ea.mem.seg = override_seg;

Could we get away with asserting ea.type == OP_MEM if override_seg is
set, to help validate our assumptions about state?  (Possibly even
passing #UD back in the non-debug case)

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks
  2016-12-06 11:14 ` [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
@ 2016-12-06 17:09   ` Andrew Cooper
  0 siblings, 0 replies; 24+ messages in thread
From: Andrew Cooper @ 2016-12-06 17:09 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 06/12/16 11:14, Jan Beulich wrote:
> If any of those hooks returns X86EMUL_EXCEPTION, some register state
> should still get updated if some iterations have been performed (but
> the rIP update will get suppressed if not all of them did get handled).
> This updating is done by register_address_increment() and
> __put_rep_prefix() (which hence must no longer be bypassed). As a
> result put_rep_prefix() can then skip most of the writeback, but needs
> to ensure proper completion of the executed instruction.
>
> While on the HVM side the VA -> LA -> PA translation process ensures
> that an exception would be raised on the first iteration only, doing so
> would unduly complicate the PV side code about to be added.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional
  2016-12-06 11:15 ` [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
@ 2016-12-06 17:34   ` Andrew Cooper
  2016-12-07  8:32     ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-06 17:34 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 06/12/16 11:15, Jan Beulich wrote:
> While the read and fetch hooks are basically unavoidable, write and
> cmpxchg aren't really needed by that many insns.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

As a corollary, please can we gain

ASSERT(ops->read && ops->fetch && ops->cpuid)

at the start of x86_emulate/decode to make it rather more obvious that
these are required.  This bit me while developing the AFL harness.

>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1492,6 +1492,8 @@ protmode_load_seg(
>      {
>          uint32_t new_desc_b = desc.b | a_flag;
>  
> +        if ( !ops->cmpxchg )
> +            return X86EMUL_UNHANDLEABLE;

Any reason this isn't a fail_if() ?

>          switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
>                                      &new_desc_b, sizeof(desc.b), ctxt)) )
>          {
> @@ -2624,13 +2626,18 @@ x86_emulate(
>          }
>          else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
>          {
> +            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
>              if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
>                                    &dst.val, dst.bytes, ctxt, ops)) )
>                  goto done;
>              dst.orig_val = dst.val;
>          }
> -        else /* Lock prefix is allowed only on RMW instructions. */
> +        else
> +        {
> +            /* Lock prefix is allowed only on RMW instructions. */
>              generate_exception_if(lock_prefix, EXC_UD);
> +            fail_if(!ops->write);

I am not sure that these two new fail_if()'s are sensibly placed here,
remote from the use of the hooks they are protecting against.

> +        }
>          break;
>      }
>  
> @@ -3334,6 +3343,7 @@ x86_emulate(
>          uint8_t depth = imm2 & 31;
>          int i;
>  
> +        fail_if(!ops->write);

This would be slightly better moved down by 3 lines to be adjacent to the
first ->write call.

>          dst.type = OP_REG;
>          dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
>          dst.reg = (unsigned long *)&_regs.ebp;
> @@ -4707,6 +4724,8 @@ x86_emulate(
>              if ( !(b & 1) )
>                  rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>                                 ea.bytes, ctxt);
> +            else
> +                fail_if(!ops->write);

Again, this wants moving closer to the ->write call.

I don't think we need to worry about partially-emulated instructions
which fail due to a lack of write.  Anything we get wrong like that will
be obvious.

~Andrew


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-06 11:16 ` [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
  2016-12-06 11:21   ` Jan Beulich
@ 2016-12-06 19:14   ` Andrew Cooper
  2016-12-07  9:35     ` Jan Beulich
  1 sibling, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-06 19:14 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Paul Durrant

On 06/12/16 11:16, Jan Beulich wrote:
> There's a new emulator return code being added to allow bypassing
> certain operations (see the code comment).
>
> The other small tweak to the emulator is to single iteration handling
> of INS and OUTS: Since we don't want to handle any other memory access
> instructions, we want these to be handled by the rep_ins() / rep_outs()
> hooks here too.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: Re-base. Do away with the special case pointer checks on the ->read
>     and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
>     for 32-bit guests (to avoid the emulator's in_longmode() returning
>     a wrong result). Introduce and use ->validate() hook. Make
>     formatting more consistent.

Clearing EFER.LM* is a behavioural change for the guest.  Given that we
hide all other traces of 64bit-ness from 32bit PV guests, it's probably a
fine change to make, but it should be called out as such.

> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
>      pv_inject_event(&event);
>  }
>  
> -static void instruction_done(
> -    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
> +static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
>  {
>      regs->eip = eip;
>      regs->eflags &= ~X86_EFLAGS_RF;
> -    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
> +    if ( regs->eflags & X86_EFLAGS_TF )
>      {
> -        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
> -        if ( regs->eflags & X86_EFLAGS_TF )
> -            current->arch.debugreg[6] |= DR_STEP;
> +        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;

Looking at this, I wonder whether I should have had an explicit
DR_STEP adjustment for the singlestep work I did.

Thinking about it, I checked that the trap was raised properly, but not
that %dr6 was correct.

>          do_guest_trap(TRAP_debug, regs);
>      }
>  }
> @@ -2023,6 +2020,148 @@ static int read_gate_descriptor(unsigned
>      return 1;
>  }
>  
> +struct priv_op_ctxt {
> +    struct x86_emulate_ctxt ctxt;
> +    struct {
> +        unsigned long base, limit;
> +    } cs;
> +    char *io_emul_stub;
> +    unsigned int bpmatch;
> +    unsigned int tsc;
> +#define TSC_BASE 1
> +#define TSC_AUX 2
> +};
> +
> +static bool priv_op_to_linear(unsigned long base, unsigned long offset,
> +                              unsigned int bytes, unsigned long limit,
> +                              enum x86_segment seg,
> +                              struct x86_emulate_ctxt *ctxt,
> +                              unsigned long *addr)

This is entirely generic.  Could it be called pv_virtual_to_linear() in
case it can be reused elsewhere?

However, I'd recommend returning X86EMUL_* codes, to avoid having the
"return X86EMUL_EXCEPTION;" separate from the x86_emul_hw_exception() call.

> +{
> +    bool okay;
> +
> +    *addr = base + offset;
> +
> +    if ( ctxt->addr_size < 8 )
> +    {
> +        okay = limit >= bytes - 1 && offset <= limit - bytes + 1;
> +        *addr = (uint32_t)*addr;
> +    }
> +    else
> +        okay = __addr_ok(*addr);
> +
> +    if ( unlikely(!okay) )
> +        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
> +                                                : TRAP_stack_error,
> +                              0, ctxt);
> +
> +    return okay;
> +}
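
Concretely, the X86EMUL_* variant suggested above might look something like
this (an untested sketch only - same logic and names as the patch, just the
return convention changed):

static int priv_op_to_linear(unsigned long base, unsigned long offset,
                             unsigned int bytes, unsigned long limit,
                             enum x86_segment seg,
                             struct x86_emulate_ctxt *ctxt,
                             unsigned long *addr)
{
    bool okay;

    *addr = base + offset;

    if ( ctxt->addr_size < 8 )
    {
        okay = limit >= bytes - 1 && offset <= limit - bytes + 1;
        *addr = (uint32_t)*addr;
    }
    else
        okay = __addr_ok(*addr);

    if ( unlikely(!okay) )
    {
        /* Exception raised right next to the return code it implies. */
        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
                                                : TRAP_stack_error, 0, ctxt);
        return X86EMUL_EXCEPTION;
    }

    return X86EMUL_OKAY;
}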
> +
> <snip>
> +static int priv_op_read_segment(enum x86_segment seg,
> +                                struct segment_register *reg,
> +                                struct x86_emulate_ctxt *ctxt)
> +{
> +    if ( ctxt->addr_size < 8 )
> +    {
> +        unsigned long limit;
> +        unsigned int sel, ar;
> +
> +        switch ( seg )
> +        {
> +        case x86_seg_cs: sel = ctxt->regs->cs; break;
> +        case x86_seg_ds: sel = read_sreg(ds);  break;
> +        case x86_seg_es: sel = read_sreg(es);  break;
> +        case x86_seg_fs: sel = read_sreg(fs);  break;
> +        case x86_seg_gs: sel = read_sreg(gs);  break;
> +        case x86_seg_ss: sel = ctxt->regs->ss; break;
> +        case x86_seg_tr:
> +            /* Check if this is an attempt to access to I/O bitmap. */
> +            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
> +                return X86EMUL_DONE;

Is this a leftover from before my system-segment work?

I/O bitmap accesses are now through the ->read() hook with x86_seg_tr.

> +            /* fall through */
> +        default: return X86EMUL_UNHANDLEABLE;
> +        }
> +
> +        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
> +            return X86EMUL_UNHANDLEABLE;
> +
> +        reg->limit = limit;
> +        reg->attr.bytes = ar >> 8;
> +    }
> +    else
> +    {
> +        switch ( seg )
> +        {
> +        default:
> +            reg->base = 0;
> +            break;

This is incorrect for system segments.  If we don't intend to service
them, we should ASSERT() their absence.

> +        case x86_seg_fs:
> +            reg->base = rdfsbase();
> +            break;
> +        case x86_seg_gs:
> +            reg->base = rdgsbase();
> +            break;
> +        }
> +
> +        reg->limit = ~0U;
> +
> +        reg->attr.bytes = 0;
> +        reg->attr.fields.type = _SEGMENT_WR >> 8;
> +        if ( seg == x86_seg_cs )
> +            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
> +        reg->attr.fields.s   = 1;
> +        reg->attr.fields.dpl = 3;
> +        reg->attr.fields.p   = 1;
> +        reg->attr.fields.l   = 1;
> +        reg->attr.fields.db  = 1;
> +        reg->attr.fields.g   = 1;

This can't all be correct.  Having %cs with both D and L set is
definitely wrong in 64bit mode.

> +    }
> +
> +    /*
> +     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
> +     * Also do this for consistency for non-conforming code segments.
> +     */
> +    if ( (seg == x86_seg_ss ||
> +          (seg == x86_seg_cs &&
> +           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
> +         guest_kernel_mode(current, ctxt->regs) )
> +        reg->attr.fields.dpl = 0;

Have you run the XTF pv-iopl tests?  I'm fairly sure this simplification
will regress the behaviour of guests not using my recent
VMASST_TYPE_architectural_iopl extension.

Having said that, the old behaviour was supremely unhelpful for all
parties involved, so there is an argument to be made for altering its
behaviour.

> <snip>
> +
> +static int priv_op_read_io(unsigned int port, unsigned int bytes,
> +                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
> +{
> +    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    struct domain *currd = current->domain;
> +
> +    /* INS must not come here. */
> +    ASSERT((ctxt->opcode & ~9) == 0xe4);

In this case, I think it would be better to have:

if ( unlikely((ctxt->opcode & ~9) != 0xe4) )
{
    /* INS must not come here.  Only tolerate IN. */
    ASSERT_UNREACHABLE();
    return X86EMUL_UNHANDLEABLE;
}

So we still bail in release builds.

> +
> +    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
> +
> +    if ( admin_io_okay(port, bytes, currd) )
> +    {
> +        io_emul_stub_t *io_emul =
> +            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
> +
> +        mark_regs_dirty(ctxt->regs);
> +        io_emul(ctxt->regs);
> +        return X86EMUL_DONE;
> +    }
> +
> +    *val = guest_io_read(port, bytes, currd);
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static int priv_op_write_io(unsigned int port, unsigned int bytes,
> +                            unsigned long val, struct x86_emulate_ctxt *ctxt)
> +{
> +    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    struct domain *currd = current->domain;
> +
> +    /* OUTS must not come here. */
> +    ASSERT((ctxt->opcode & ~9) == 0xe6);

Similarly here.

>  int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
>                    unsigned int *edx, struct x86_emulate_ctxt *ctxt)
>  {
>      struct cpu_user_regs regs = *ctxt->regs;
>  
> +    /*
> +     * x86_emulate uses this function to query CPU features for its own
> +     * internal use. Make sure we're actually emulating CPUID before checking
> +     * for emulated CPUID faulting.
> +     */
> +    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
> +    {
> +        const struct vcpu *curr = current;
> +
> +        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
> +        if ( curr->arch.cpuid_faulting &&
> +             !guest_kernel_mode(curr, ctxt->regs) )
> +            return X86EMUL_UNHANDLEABLE;

I don't think this is conceptually correct.  The emulator should
explicitly raise #GP[0] here, rather than relying on the calling
behaviour of bouncing the pending exception back to the caller.
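
I.e. something along these lines (sketch only, using the names already in the
hunk above):

if ( curr->arch.cpuid_faulting && !guest_kernel_mode(curr, ctxt->regs) )
{
    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
    return X86EMUL_EXCEPTION;
}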

Also, please double check with the XTF CPUID faulting tests if you
haven't done so already, but it does look like this maintains the
correct guest-observable behaviour.

> <snip>
> +    case 0x6c ... 0x6f: /* ins / outs */
> +    case 0xe4 ... 0xe7: /* in / out (immediate port) */
> +    case 0xec ... 0xef: /* in / out (port in %dx) */
> +    case X86EMUL_OPC(0x0f, 0x06): /* clts */
> +    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
> +    case X86EMUL_OPC(0x0f, 0x20) ...
> +         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
> +    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
> +    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
> +    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
> +    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
> +        return X86EMUL_OKAY;
>  
> -        case 0x6e: /* OUTSB */
> -            op_bytes = 1;
> -        case 0x6f: /* OUTSW/OUTSL */
> -            if ( (data_limit < (op_bytes - 1)) ||
> -                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
> -                  !guest_io_okay(port, op_bytes, v, regs) )
> -                goto fail;
> -            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
> -                                      op_bytes)) != 0 )
> -            {
> -                pv_inject_page_fault(0, data_base + rd_ad(esi)
> -                                     + op_bytes - rc);
> -                return EXCRET_fault_fixed;
> -            }
> -            guest_io_write(port, op_bytes, data, currd);
> -            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
> -                                         ? -op_bytes : op_bytes));
> +    case 0xfa: case 0xfb: /* cli / sti */
> +        if ( !iopl_ok(current, ctxt->regs) )

Given the use of DONE below, this part should also explicitly raise
#GP[0], rather than falling through to UNHANDLEABLE, as we are providing
an emulation of the instruction.

> <snip>
> +        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
> +             (modrm_rm & 7) != 1 )
>              break;
> -        case 0xd1: /* XSETBV */
> +        switch ( modrm_reg & 7 )
>          {
> -            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
> +        case 2: /* xgetbv */

You appear to have altered XSETBV for XGETBV here.  XGETBV wasn't
previously emulated, and doesn't have any condition (we care about
emulating) which suffers #GP.

> <snip>
> +    case X86EMUL_EXCEPTION:
> +        pv_inject_event(&ctxt.ctxt.event);
> +        return EXCRET_fault_fixed;
>      }
>  
> -#undef wr_ad
> -#undef rd_ad
> -
> - done:
> -    instruction_done(regs, eip, bpmatch);
> - skip:
> -    if ( io_emul_stub )
> -        unmap_domain_page(io_emul_stub);
> -    return EXCRET_fault_fixed;
> -
> - fail:
> -    if ( io_emul_stub )
> -        unmap_domain_page(io_emul_stub);
>      return 0;

This return can be dropped.  There are no paths out of the switch
statement, and dropping this return will cause the compiler to notice if
one gets introduced in the future.

>  }
>  
> @@ -3599,7 +3677,7 @@ static void emulate_gate_op(struct cpu_u
>          sel |= (regs->cs & 3);
>  
>      regs->cs = sel;
> -    instruction_done(regs, off, 0);
> +    instruction_done(regs, off);
>  }
>  
>  void do_general_protection(struct cpu_user_regs *regs)
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1182,7 +1182,7 @@ static int ioport_access_check(
>  
>      fail_if(ops->read_segment == NULL);
>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
> -        return rc;
> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;

Ah, so the real purpose of having read_segment(x86_seg_tr, ...) return
X86EMUL_DONE is to defer the port check into the hooks themselves.

This doesn't actually match your intended meaning of X86EMUL_DONE, or
the documentation you provide below.  I think it would be cleaner in
this case to have an input x86_emulate_ctxt boolean requesting to skip
%tr ioport checks, which causes ioport_access_check() to exit early with
X86EMUL_OKAY.
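
Roughly like this (a sketch only - the field name here is made up):

/* New input field in struct x86_emulate_ctxt: */
bool no_tr_ioport_check;

/* ... and at the top of ioport_access_check(): */
if ( ctxt->no_tr_ioport_check )
    return X86EMUL_OKAY;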

> <snip>
> +struct x86_emulate_state;
> +
>  /*
>   * These operations represent the instruction emulator's interface to memory,
>   * I/O ports, privileged state... pretty much everything other than GPRs.
> @@ -239,6 +249,13 @@ struct x86_emulate_ops
>          struct x86_emulate_ctxt *ctxt);
>  
>      /*
> +     * validate: Post-decode hook to allow caller controlled filtering.

"Post-decode, pre-emulate hook...", to specify exactly where it lives.

~Andrew

> +     */
> +    int (*validate)(
> +        const struct x86_emulate_state *state,
> +        struct x86_emulate_ctxt *ctxt);
> +
> +    /*
>       * rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
>       *  @bytes_per_rep: [IN ] Bytes transferred per repetition.
>       *  @reps:  [IN ] Maximum repetitions to be emulated.
>
>



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-06 16:49   ` Andrew Cooper
@ 2016-12-07  8:22     ` Jan Beulich
  2016-12-07 13:05       ` Andrew Cooper
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-07  8:22 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 06.12.16 at 17:49, <andrew.cooper3@citrix.com> wrote:
> On 06/12/16 11:13, Jan Beulich wrote:
>> @@ -2359,7 +2360,7 @@ x86_decode(
>>          }
>>      }
>>  
>> -    if ( override_seg != -1 && ea.type == OP_MEM )
>> +    if ( override_seg != x86_seg_none )
>>          ea.mem.seg = override_seg;
> 
> Could we get away with asserting ea.type == OP_MEM if override_seg is
> set, to help validate our assumptions about state?  (Possibly even
> passing #UD back in the non-debug case)

That would be wrong - we'd be asserting guest-controlled state.
There's nothing preventing a segment override from being present on
instructions without memory operands. And for example string
insns don't have OP_MEM set despite having (implicit) memory
operands (after all that's the whole reason for the change here
[but not the patch as a whole], as the following PV priv-op patch
requires the segment override to take effect on OUTS). Nor
would such be correct for conditional branches, where some of
the segment overrides have a different meaning (necessarily
ignored by the emulator).
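
As a concrete (hypothetical) example of such guest code, the two-byte sequence

static const uint8_t insn[] = { 0x64, 0x6e };    /* fs; outsb */

has no ModRM memory operand, so ea.type isn't OP_MEM, yet the 0x64 override
still needs to reach the rep_outs() hook for the implicit source operand.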

> Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Let me know whether this is still applicable in light of the above.

Jan



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional
  2016-12-06 17:34   ` Andrew Cooper
@ 2016-12-07  8:32     ` Jan Beulich
  2016-12-07 13:36       ` Andrew Cooper
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-07  8:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 06.12.16 at 18:34, <andrew.cooper3@citrix.com> wrote:
> On 06/12/16 11:15, Jan Beulich wrote:
>> While the read and fetch hooks are basically unavoidable, write and
>> cmpxchg aren't really needed by that many insns.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> As a corollary, please can we gain
> 
> ASSERT(ops->read && ops->fetch && ops->cpuid)
> 
> at the start of x86_emulate/decode to make it rather more obvious that
> these are required.  This bit me while developing the AFL harness.

Well, not exactly: The ->read hook formally isn't required by the
decoder, so I'd prefer to assert its presence in x86_emulate(). And
I don't see why the ->cpuid() hook would be required all of a
sudden - all its uses are guarded by a NULL check.
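
I.e. something like this (sketch; assuming the fetch hook is spelled
->insn_fetch, as in struct x86_emulate_ops):

/* In x86_decode(): */
ASSERT(ops->insn_fetch);

/* In x86_emulate(): */
ASSERT(ops->read && ops->insn_fetch);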

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -1492,6 +1492,8 @@ protmode_load_seg(
>>      {
>>          uint32_t new_desc_b = desc.b | a_flag;
>>  
>> +        if ( !ops->cmpxchg )
>> +            return X86EMUL_UNHANDLEABLE;
> 
> Any reason this isn't a fail_if() ?

Right, it certainly can be made one.

>> @@ -2624,13 +2626,18 @@ x86_emulate(
>>          }
>>          else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
>>          {
>> +            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
>>              if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
>>                                    &dst.val, dst.bytes, ctxt, ops)) )
>>                  goto done;
>>              dst.orig_val = dst.val;
>>          }
>> -        else /* Lock prefix is allowed only on RMW instructions. */
>> +        else
>> +        {
>> +            /* Lock prefix is allowed only on RMW instructions. */
>>              generate_exception_if(lock_prefix, EXC_UD);
>> +            fail_if(!ops->write);
> 
> I am not sure that these two new fail_if()'s are sensibly placed here,
> remote from the use of the hooks they are protecting against.

Well - I don't see a point in continuing the emulation attempt in that
case. They're being duplicated in the writeback code already
anyway, for safety reasons.

>> @@ -3334,6 +3343,7 @@ x86_emulate(
>>          uint8_t depth = imm2 & 31;
>>          int i;
>>  
>> +        fail_if(!ops->write);
> 
> This would be slighly better moved down by 3 lines to be adjacent to the
> first ->write call.

I can certainly do this, but ...

>> @@ -4707,6 +4724,8 @@ x86_emulate(
>>              if ( !(b & 1) )
>>                  rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>>                                 ea.bytes, ctxt);
>> +            else
>> +                fail_if(!ops->write);
> 
> Again, this wants moving closer to the ->write call.
> 
> I don't think we need to worry about partially-emulated instructions
> which fail due to a lack of write.  Anything we get wrong like that will
> be obvious.

... I'm rather hesitant to do what you ask for here: I'm of the
opinion that we shouldn't alter machine state (MMX/XMM/YMM
registers) in that case. Could you live with it staying here and
an ASSERT() being added right ahead of the use?

Jan



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling
  2016-12-06 19:14   ` Andrew Cooper
@ 2016-12-07  9:35     ` Jan Beulich
  0 siblings, 0 replies; 24+ messages in thread
From: Jan Beulich @ 2016-12-07  9:35 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Paul Durrant

>>> On 06.12.16 at 20:14, <andrew.cooper3@citrix.com> wrote:
> On 06/12/16 11:16, Jan Beulich wrote:
>> @@ -2023,6 +2020,148 @@ static int read_gate_descriptor(unsigned
>>      return 1;
>>  }
>>  
>> +struct priv_op_ctxt {
>> +    struct x86_emulate_ctxt ctxt;
>> +    struct {
>> +        unsigned long base, limit;
>> +    } cs;
>> +    char *io_emul_stub;
>> +    unsigned int bpmatch;
>> +    unsigned int tsc;
>> +#define TSC_BASE 1
>> +#define TSC_AUX 2
>> +};
>> +
>> +static bool priv_op_to_linear(unsigned long base, unsigned long offset,
>> +                              unsigned int bytes, unsigned long limit,
>> +                              enum x86_segment seg,
>> +                              struct x86_emulate_ctxt *ctxt,
>> +                              unsigned long *addr)
> 
> This is entirely generic.  Could it be called pv_virtual_to_linear() in
> case it can be reused elsewhere?

Well, it's not entirely generic, as it requires an emulator context
structure. It could be used by other emulator hooks. Therefore I'm
not sure the suggested name is suitable - I'll use
pv_emul_virt_to_linear() instead. And I'll move it ahead of the
struct priv_op_ctxt declaration, to demonstrate it has no
dependency on it.

>> +static int priv_op_read_segment(enum x86_segment seg,
>> +                                struct segment_register *reg,
>> +                                struct x86_emulate_ctxt *ctxt)
>> +{
>> +    if ( ctxt->addr_size < 8 )
>> +    {
>> +        unsigned long limit;
>> +        unsigned int sel, ar;
>> +
>> +        switch ( seg )
>> +        {
>> +        case x86_seg_cs: sel = ctxt->regs->cs; break;
>> +        case x86_seg_ds: sel = read_sreg(ds);  break;
>> +        case x86_seg_es: sel = read_sreg(es);  break;
>> +        case x86_seg_fs: sel = read_sreg(fs);  break;
>> +        case x86_seg_gs: sel = read_sreg(gs);  break;
>> +        case x86_seg_ss: sel = ctxt->regs->ss; break;
>> +        case x86_seg_tr:
>> +            /* Check if this is an attempt to access to I/O bitmap. */
>> +            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
>> +                return X86EMUL_DONE;
> 
> This this leftovers from before my system-segment work?
> 
> I/O bitmap accesses are now through the ->read() hook with x86_seg_tr.

No, the first thing ioport_access_check() does is a read_segment().
Plus there's no functional ->read() hook being installed by
emulate_privileged_op() in the first place.

>> +            /* fall through */
>> +        default: return X86EMUL_UNHANDLEABLE;
>> +        }
>> +
>> +        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
>> +            return X86EMUL_UNHANDLEABLE;
>> +
>> +        reg->limit = limit;
>> +        reg->attr.bytes = ar >> 8;
>> +    }
>> +    else
>> +    {
>> +        switch ( seg )
>> +        {
>> +        default:
>> +            reg->base = 0;
>> +            break;
> 
> This is incorrect for system segments.  If we don't intend to service
> them, we should ASSERT() their absence.

Well, to be in line with the 32-bit code I'd rather make this an
X86EMUL_UNHANDLEABLE return.

>> +        case x86_seg_fs:
>> +            reg->base = rdfsbase();
>> +            break;
>> +        case x86_seg_gs:
>> +            reg->base = rdgsbase();
>> +            break;
>> +        }
>> +
>> +        reg->limit = ~0U;
>> +
>> +        reg->attr.bytes = 0;
>> +        reg->attr.fields.type = _SEGMENT_WR >> 8;
>> +        if ( seg == x86_seg_cs )
>> +            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
>> +        reg->attr.fields.s   = 1;
>> +        reg->attr.fields.dpl = 3;
>> +        reg->attr.fields.p   = 1;
>> +        reg->attr.fields.l   = 1;
>> +        reg->attr.fields.db  = 1;
>> +        reg->attr.fields.g   = 1;
> 
> This can't all be correct.  Having %cs with both D and L set is
> definitely wrong in 64bit mode.

Oh, of course - while it's fine the way it is, I'd better set exactly
one of them (depending on whether it's CS).

>> +    }
>> +
>> +    /*
>> +     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
>> +     * Also do this for consistency for non-conforming code segments.
>> +     */
>> +    if ( (seg == x86_seg_ss ||
>> +          (seg == x86_seg_cs &&
>> +           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
>> +         guest_kernel_mode(current, ctxt->regs) )
>> +        reg->attr.fields.dpl = 0;
> 
> Have you run the XTF pv-iopl tests?  I'm fairly sure this simplification
> will regress the behaviour of guests not using my recent
> VMASST_TYPE_architectural_iopl extension.

I didn't run that test yet, but I also don't see where you think a
regression could slip in here: State is being faked for x86_emulate()
here, invisible to the guest.

>> +static int priv_op_read_io(unsigned int port, unsigned int bytes,
>> +                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
>> +{
>> +    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
>> +    struct vcpu *curr = current;
>> +    struct domain *currd = current->domain;
>> +
>> +    /* INS must not come here. */
>> +    ASSERT((ctxt->opcode & ~9) == 0xe4);
> 
> In this case, I think it would be better to have:
> 
> if ( unlikely((ctxt->opcode & ~9) != 0xe4) )
> {
>     /* INS must not come here.  Only tolerate IN. */
>     ASSERT_UNREACHABLE();
>     return X86EMUL_UNHANDLEABLE;
> }
> 
> So we still bail in release builds.

I don't see the point: The missing ->write() hook will have the same
effect (and the fake ->read() one for the OUTS equivalent).

>>  int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
>>                    unsigned int *edx, struct x86_emulate_ctxt *ctxt)
>>  {
>>      struct cpu_user_regs regs = *ctxt->regs;
>>  
>> +    /*
>> +     * x86_emulate uses this function to query CPU features for its own
>> +     * internal use. Make sure we're actually emulating CPUID before checking
>> +     * for emulated CPUID faulting.
>> +     */
>> +    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
>> +    {
>> +        const struct vcpu *curr = current;
>> +
>> +        /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
>> +        if ( curr->arch.cpuid_faulting &&
>> +             !guest_kernel_mode(curr, ctxt->regs) )
>> +            return X86EMUL_UNHANDLEABLE;
> 
> I don't think this is conceptually correct.  The emulator should
> explicitly raise #GP[0] here, rather than relying on the calling
> behaviour of bouncing the pending exception back to the caller.

Well, I think it's okay either way, but I've changed it to
X86EMUL_EXCEPTION. This will go away anyway with your
other series.

>> +    case 0x6c ... 0x6f: /* ins / outs */
>> +    case 0xe4 ... 0xe7: /* in / out (immediate port) */
>> +    case 0xec ... 0xef: /* in / out (port in %dx) */
>> +    case X86EMUL_OPC(0x0f, 0x06): /* clts */
>> +    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
>> +    case X86EMUL_OPC(0x0f, 0x20) ...
>> +         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
>> +    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
>> +    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
>> +    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
>> +    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
>> +        return X86EMUL_OKAY;
>>  
>> -        case 0x6e: /* OUTSB */
>> -            op_bytes = 1;
>> -        case 0x6f: /* OUTSW/OUTSL */
>> -            if ( (data_limit < (op_bytes - 1)) ||
>> -                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
>> -                  !guest_io_okay(port, op_bytes, v, regs) )
>> -                goto fail;
>> -            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
>> -                                      op_bytes)) != 0 )
>> -            {
>> -                pv_inject_page_fault(0, data_base + rd_ad(esi)
>> -                                     + op_bytes - rc);
>> -                return EXCRET_fault_fixed;
>> -            }
>> -            guest_io_write(port, op_bytes, data, currd);
>> -            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
>> -                                         ? -op_bytes : op_bytes));
>> +    case 0xfa: case 0xfb: /* cli / sti */
>> +        if ( !iopl_ok(current, ctxt->regs) )
> 
> Given the use of DONE below, this part should also explicitly raise
> #GP[0], rather than falling through to UNHANDLEABLE, as we are providing
> an emulation of the instruction.

We're providing an emulation only if iopl_ok(), so I don't see the
need. The way it's done now matches previous behavior.

>> +        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
>> +             (modrm_rm & 7) != 1 )
>>              break;
>> -        case 0xd1: /* XSETBV */
>> +        switch ( modrm_reg & 7 )
>>          {
>> -            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
>> +        case 2: /* xgetbv */
> 
> You appear to have altered XSETBV for XGETBV here.  XGETBV wasn't
> previously emulated, and doesn't have any condition (we care about
> emulating) which suffers #GP.

Indeed the comment was wrong, but the code was correct (0xd1
[ = 0b11010001] decodes to modrm_rm = 1 and modrm_reg = 2).
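
For reference, a trivial standalone check of that decoding (illustration
only, not hypervisor code):

#include <stdio.h>

int main(void)
{
    unsigned int modrm = 0xd1;            /* 0b11010001 */
    unsigned int mod = modrm >> 6;        /* 3 - register operand */
    unsigned int reg = (modrm >> 3) & 7;  /* 2 - xgetbv */
    unsigned int rm  = modrm & 7;         /* 1 */

    printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);
    return 0;
}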

>> +    case X86EMUL_EXCEPTION:
>> +        pv_inject_event(&ctxt.ctxt.event);
>> +        return EXCRET_fault_fixed;
>>      }
>>  
>> -#undef wr_ad
>> -#undef rd_ad
>> -
>> - done:
>> -    instruction_done(regs, eip, bpmatch);
>> - skip:
>> -    if ( io_emul_stub )
>> -        unmap_domain_page(io_emul_stub);
>> -    return EXCRET_fault_fixed;
>> -
>> - fail:
>> -    if ( io_emul_stub )
>> -        unmap_domain_page(io_emul_stub);
>>      return 0;
> 
> This return can be dropped.  There are no paths out of the switch
> statement, and dropping this return will cause the compiler to notice if
> one gets introduced in the future.

I disagree - X86EMUL_UNHANDLEABLE comes here (as well as any
potential bogus return values).

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -1182,7 +1182,7 @@ static int ioport_access_check(
>>  
>>      fail_if(ops->read_segment == NULL);
>>      if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
>> -        return rc;
>> +        return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
> 
> Ah, so the real purpose of having read_segment(x86_seg_tr, ...) return
> X86EMUL_DONE is to defer the port check into the hooks themselves.
> 
> This doesn't actually match your intended meaning of X86EMUL_DONE, or
> the documentation you provide below.  I think it would be cleaner in
> this case to have an input x86_emulate_ctxt boolean requesting to skip
> %tr ioport checks, which causes ioport_access_check() to exit early with
> X86EMUL_OKAY.

How does it not match the intended or documented meaning of
DONE? Nothing says that it means skipping all further
emulation. Instead, the meaning is described explicitly for each
of the hooks it is valid to be used with. I don't think
another input would be a good idea here; in particular I'd like
to avoid adding new inputs where possible, as that always
requires updating all existing callers of x86_emulate().

Jan


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-06 13:17     ` Jan Beulich
@ 2016-12-07 10:55       ` Andrew Cooper
  2016-12-07 11:23         ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-07 10:55 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 06/12/16 13:17, Jan Beulich wrote:
>>>> On 06.12.16 at 12:56, <andrew.cooper3@citrix.com> wrote:
>> On 06/12/16 11:12, Jan Beulich wrote:
>>> +static int gate_op_read(
>>> +    enum x86_segment seg,
>>> +    unsigned long offset,
>>> +    void *p_data,
>>> +    unsigned int bytes,
>>> +    struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    const struct gate_op_ctxt *goc =
>>> +        container_of(ctxt, struct gate_op_ctxt, ctxt);
>>> +    unsigned int rc = bytes, sel = 0;
>>> +    unsigned long addr = offset, limit = 0;
>>> +
>>> +    switch ( seg )
>>> +    {
>>> +    case x86_seg_cs:
>>> +        addr += goc->cs.base;
>>> +        limit = goc->cs.limit;
>>> +        break;
>>> +    case x86_seg_ds:
>>> +        sel = read_sreg(ds);
>>> +        break;
>>> +    case x86_seg_es:
>>> +        sel = read_sreg(es);
>>> +        break;
>>> +    case x86_seg_fs:
>>> +        sel = read_sreg(fs);
>>> +        break;
>>> +    case x86_seg_gs:
>>> +        sel = read_sreg(gs);
>>> +        break;
>>> +    case x86_seg_ss:
>>> +        sel = ctxt->regs->ss;
>>> +        break;
>>> +    default:
>>> +        return X86EMUL_UNHANDLEABLE;
>>> +    }
>>> +    if ( sel )
>>> +    {
>>> +        unsigned int ar;
>>> +
>>> +        ASSERT(!goc->insn_fetch);
>>> +        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
>>> +             !(ar & _SEGMENT_S) ||
>>> +             !(ar & _SEGMENT_P) ||
>>> +             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
>>> +            return X86EMUL_UNHANDLEABLE;
>>> +        addr += offset;
>>> +    }
>>> +    else if ( seg != x86_seg_cs )
>>> +        return X86EMUL_UNHANDLEABLE;
>>> +
>>> +    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
>> Is this correct for the zero-length instruction fetches which the
>> emulator emits?  It would be better to make it safe now, than to
>> introduce a latent bug in the future.
> The left side of the || unconditionally fails such fetches, and that's
> precisely what we want here (as tight a condition as possible)
> considering that this gets used only for decoding, not for executing
> instructions.

Mind adding a similar comment to patch 5 along the lines of?

/* We don't mean to emulate any branches. */

>
>>> +        return X86EMUL_UNHANDLEABLE;
>>> +
>>> +    if ( is_pv_32bit_vcpu(current) )
>>> +        addr = (uint32_t)addr;
>> This should be based on %cs attributes.  What about a 64bit PV guest in
>> a compatability segment?
> No - I now realize the conditional is pointless. We only do gate op
> emulation for 32-bit guests.

Good point.  Do we actually ASSERT() this anywhere?  It looks like this
property only holds as a side effect of emulate_gate_op()'s caller.

>
>>> +
>>> +    if ( !__addr_ok(addr) ||
>>> +         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
>>> +    {
>>> +        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
>>> +                           ? PFEC_insn_fetch : 0,
>>> +                           addr + bytes - rc, ctxt);
>> Independently to the point below, the correct predicate for setting
>> PFEC_insn_fetch is ((EFER & EFER_NX) || (CR4 & SMEP))
> Remember that here we're dealing with PV guests only - those don't
> see any SMEP, and always run with NX enabled as long as we do
> ourselves.

Conceptually, this is wrong.

For PV guests, we should always base paging decisions on what is really
in EFER and CR4, not on a theoretical guest's view of the world.  For
pagefaults which we decide aren't relevant to Xen, the propagated fault
will be using the real EFER and CR4.
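
(As a sketch - assuming the usual read_efer()/read_cr4() accessors - that
would be:

bool insn_fetch_pfec = (read_efer() & EFER_NX) || (read_cr4() & X86_CR4_SMEP);

i.e. keyed off the real control register state, not the guest's view of it.)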

In this case (being 32bit only) SMEP is indeed disabled in the context
of the guest, but for 64bit PV guests, Xen's SMEP should be considered,
as it will affect the guest's view of hardware.  However, ...

>> However, this is subtly wrong, but I can't think of a sensible way of
>> fixing it (i.e. without doing independent pagewalks).
>>
>> __copy_from_user() does a data fetch, not an instruction fetch.
>>
>> With the advent of PKU, processors support pages which can't be read,
>> but can be executed.  Then again, our chances of ever supporting PKU for
>> PV guests is slim, so maybe this point isn't very important.
>>
>> As we actually do a data read, the error code should remain 0.  This
>> (getting a data fetch failure for something which should have been an
>> instruction fetch) will be less confusing for the guest than claiming an
>> instruction fetch failure when we actually did a data fetch, as at least
>> the error code will be correct for the access rights in the translation.
> With the above I think this should remain as is.

... Why?  The problem is still unresolved.

__copy_from_user() will successfully read a bytestream which was
protected in the pagetables by the NX bit, because it performs a data
read not an instruction fetch.

In the case where a fault occurred, Xen should report to the guest that it
performed a data read, not an instruction fetch, because that's what it
actually did.

A guest might rightly expect an instruction fetch fault if it was paying
close attention, but we already provide a data read failure today, and
this is better than fabricating a fault matching what Xen should have
done (but didn't).

~Andrew


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-07 10:55       ` Andrew Cooper
@ 2016-12-07 11:23         ` Jan Beulich
  2016-12-07 11:39           ` Andrew Cooper
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-07 11:23 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 07.12.16 at 11:55, <andrew.cooper3@citrix.com> wrote:
> On 06/12/16 13:17, Jan Beulich wrote:
>>>>> On 06.12.16 at 12:56, <andrew.cooper3@citrix.com> wrote:
>>> On 06/12/16 11:12, Jan Beulich wrote:
>>>> +        return X86EMUL_UNHANDLEABLE;
>>>> +
>>>> +    if ( is_pv_32bit_vcpu(current) )
>>>> +        addr = (uint32_t)addr;
>>> This should be based on %cs attributes.  What about a 64bit PV guest in
>>> a compatability segment?
>> No - I now realize the conditional is pointless. We only do gate op
>> emulation for 32-bit guests.
> 
> Good point.  Do we actually ASSERT() this anywhere?  It looks like this
> properly is only a side effect of emulate_gate_op()'s caller.

Why would we assert this anywhere here, when this is just a helper
for emulate_gate_op()? I think I've said so elsewhere not too long
ago - ASSERT()s are fine, but we shouldn't go overboard with them.

>>>> +
>>>> +    if ( !__addr_ok(addr) ||
>>>> +         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
>>>> +    {
>>>> +        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
>>>> +                           ? PFEC_insn_fetch : 0,
>>>> +                           addr + bytes - rc, ctxt);
>>> Independently to the point below, the correct predicate for setting
>>> PFEC_insn_fetch is ((EFER & EFER_NX) || (CR4 & SMEP))
>> Remember that here we're dealing with PV guests only - those don't
>> see any SMEP, and always run with NX enabled as long as we do
>> ourselves.
> 
> Conceptually, this is wrong.

I agree, but the wrongness is limited to "conceptually" - in practice
we rely on certain assumptions for PV guests anyway.

> For PV guests, we should always base paging decisions on what is really
> in EFER and CR4, not on a theoretical guests view of the world.  For
> pagefaults which we decide isn't relevant to Xen, the propagated fault
> will be using the real EFER and CR4.
> 
> In this case (being 32bit only) SMEP is indeed disabled in the context
> of the guest, but for 64bit PV guests, Xen's SMEP should be considered,
> as it will affect the guests view of hardware.  However, ...
> 
>>> However, this is subtly wrong, but I can't think of a sensible way of
>>> fixing it (i.e. without doing independent pagewalks).
>>>
>>> __copy_from_user() does a data fetch, not an instruction fetch.
>>>
>>> With the advent of PKU, processors support pages which can't be read,
>>> but can be executed.  Then again, our chances of ever supporting PKU for
>>> PV guests is slim, so maybe this point isn't very important.
>>>
>>> As we actually do a data read, the error code should remain 0.  This
>>> (getting a data fetch failure for something which should have been an
>>> instruction fetch) will be less confusing for the guest than claiming an
>>> instruction fetch failure when we actually did a data fetch, as at least
>>> the error code will be correct for the access rights in the translation.
>> With the above I think this should remain as is.
> 
> ... Why?  The problem is still unresolved.
> 
> __copy_from_user() will successfully read a bytestream which was
> protected in the pagetables by the NX bit, because it performs a data
> read not an instruction fetch.
> 
> In the case of a fault occurred, Xen should report to the guest that it
> performed a data read, not an instruction fetch, because that's what it
> actually did.
> 
> A guest might rightly expect an instruction fetch fault if it was paying
> close attention, but we already currently provide a data read failure,
> and this is better than fabricating a fault of what Xen should have done
> (but didn't).

Okay - I think I need to ask you to stop mixing what this patch does
with what may want to be changed, but is orthogonal: We've used
copy_from_user() before this patch, so the problem you're pointing
out is pre-existing and should not be fixed in this patch. If you think
it needs fixing at all, it should be another, follow-up patch.

Which leaves us with deciding what error code to use: Of course we
could stick with not surfacing PFEC_insn_fetch, but that feels rather
wrong as long as we're talking about instruction fetches. But if you
insist, I could agree to change it on the basis that the original code
also didn't get this right.
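
For concreteness, the two candidates, side by side (the first re-uses
the hunk quoted above, the second is what sticking with a plain data
read error code would look like):

    /* As posted: claim an instruction fetch whenever NX is in use. */
    x86_emul_pagefault(goc->insn_fetch && cpu_has_nx ? PFEC_insn_fetch : 0,
                       addr + bytes - rc, ctxt);

    /* Alternative: report the data read which was actually performed. */
    x86_emul_pagefault(0, addr + bytes - rc, ctxt);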

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation
  2016-12-07 11:23         ` Jan Beulich
@ 2016-12-07 11:39           ` Andrew Cooper
  0 siblings, 0 replies; 24+ messages in thread
From: Andrew Cooper @ 2016-12-07 11:39 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 07/12/16 11:23, Jan Beulich wrote:
>>>> On 07.12.16 at 11:55, <andrew.cooper3@citrix.com> wrote:
>> On 06/12/16 13:17, Jan Beulich wrote:
>>>>>> On 06.12.16 at 12:56, <andrew.cooper3@citrix.com> wrote:
>>>> On 06/12/16 11:12, Jan Beulich wrote:
>>>>> +        return X86EMUL_UNHANDLEABLE;
>>>>> +
>>>>> +    if ( is_pv_32bit_vcpu(current) )
>>>>> +        addr = (uint32_t)addr;
>>>> This should be based on %cs attributes.  What about a 64bit PV guest in
>>>> a compatibility segment?
>>> No - I now realize the conditional is pointless. We only do gate op
>>> emulation for 32-bit guests.
>> Good point.  Do we actually ASSERT() this anywhere?  It looks like this
>> property is only a side effect of emulate_gate_op()'s caller.
> Why would we assert this anywhere here, when this is just a helper
> for emulate_gate_op()? I think I've said so elsewhere not too long
> ago - ASSERT()s are fine, but we shouldn't go overboard with them.
>
>>>>> +
>>>>> +    if ( !__addr_ok(addr) ||
>>>>> +         (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
>>>>> +    {
>>>>> +        x86_emul_pagefault(goc->insn_fetch && cpu_has_nx
>>>>> +                           ? PFEC_insn_fetch : 0,
>>>>> +                           addr + bytes - rc, ctxt);
>>>> Independently to the point below, the correct predicate for setting
>>>> PFEC_insn_fetch is ((EFER & EFER_NX) || (CR4 & SMEP))
>>> Remember that here we're dealing with PV guests only - those don't
>>> see any SMEP, and always run with NX enabled as long as we do
>>> ourselves.
>> Conceptually, this is wrong.
> I agree, but the wrongness is limited to "conceptually", as in
> practice for PV guests we make certain assumptions anyway.
>
>> For PV guests, we should always base paging decisions on what is really
>> in EFER and CR4, not on a theoretical guest's view of the world.  For
>> pagefaults which we decide aren't relevant to Xen, the propagated fault
>> will be using the real EFER and CR4.
>>
>> In this case (being 32bit only) SMEP is indeed disabled in the context
>> of the guest, but for 64bit PV guests, Xen's SMEP should be considered,
>> as it will affect the guest's view of hardware.  However, ...
>>
>>>> However, this is subtly wrong, but I can't think of a sensible way of
>>>> fixing it (i.e. without doing independent pagewalks).
>>>>
>>>> __copy_from_user() does a data fetch, not an instruction fetch.
>>>>
>>>> With the advent of PKU, processors support pages which can't be read,
>>>> but can be executed.  Then again, our chances of ever supporting PKU for
>>>> PV guests is slim, so maybe this point isn't very important.
>>>>
>>>> As we actually do a data read, the error code should remain 0.  This
>>>> (getting a data fetch failure for something which should have been an
>>>> instruction fetch) will be less confusing for the guest than claiming an
>>>> instruction fetch failure when we actually did a data fetch, as at least
>>>> the error code will be correct for the access rights in the translation.
>>> With the above I think this should remain as is.
>> ... Why?  The problem is still unresolved.
>>
>> __copy_from_user() will successfully read a bytestream which was
>> protected in the pagetables by the NX bit, because it performs a data
>> read not an instruction fetch.
>>
>> In the case where a fault occurred, Xen should report to the guest that it
>> performed a data read, not an instruction fetch, because that's what it
>> actually did.
>>
>> A guest might rightly expect an instruction fetch fault if it was paying
>> close attention, but we already currently provide a data read failure,
>> and this is better than fabricating a fault of what Xen should have done
>> (but didn't).
> Okay - I think I need to ask you to stop mixing what this patch does
> with what may want to be changed, but is orthogonal: We've used
> copy_from_user() before this patch, so the problem you're pointing
> out is pre-existing and should not be fixed in this patch. If you think
> it needs fixing at all, it should be another, follow-up patch.
>
> Which leaves us with deciding what error code to use: Of course we
> could stick with not surfacing PFEC_insn_fetch, but that feels rather
> wrong as long as we're talking about instruction fetches. But if you
> insist, I could agree to change it on the basis that the original code
> also didn't get this right.

Sorry if it was not clear, but my objection was to this patch changing
the error code (as this makes Xen's behaviour even more wrong than it
was before), not the continued use of copy_from_user() (which I don't
see a reasonable alternative to at the moment).

If you don't mind, please keep this patch maintaining the previous
behaviour, and we can address the pagefault behaviour separately.
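
For that separate fix, the predicate I'd expect would be roughly as
below (just a sketch; the helper spellings are from memory, so treat
them as illustrative):

    /*
     * Only set PFEC_insn_fetch if the active paging settings can in fact
     * distinguish instruction fetches from data reads.
     */
    bool fetch_pfec = (read_efer() & EFER_NX) || (read_cr4() & X86_CR4_SMEP);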

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-07  8:22     ` Jan Beulich
@ 2016-12-07 13:05       ` Andrew Cooper
  2016-12-07 13:14         ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-07 13:05 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 07/12/16 08:22, Jan Beulich wrote:
>>>> On 06.12.16 at 17:49, <andrew.cooper3@citrix.com> wrote:
>> On 06/12/16 11:13, Jan Beulich wrote:
>>> @@ -2359,7 +2360,7 @@ x86_decode(
>>>          }
>>>      }
>>>  
>>> -    if ( override_seg != -1 && ea.type == OP_MEM )
>>> +    if ( override_seg != x86_seg_none )
>>>          ea.mem.seg = override_seg;
>> Could we get away with asserting ea.type == OP_MEM if override_seg is
>> set, to help validate our assumptions about state?  (Possibly even
>> passing #UD back in the non-debug case)
> That would be wrong - we'd be asserting guest controlled state.
> There's nothing preventing a segment override from being present on
> instructions without memory operands.

True, but such overrides should have no effect on the instruction.

> And for example string
> insns don't have OP_MEM set despite having (implicit) memory
> operands (after all that's the hole reason for the change here
> [but not the patch as a hole], as the following PV priv-op patch

Do you mean whole instead of hole here?

> requires the segment override to take effect on OUTS). Nor
> would such be correct for conditional branches, where some of
> the segment overrides have a different meaning (necessarily
> ignored by the emulator).

The point I am trying to address is that it feels wrong to be
referencing ea.mem for non OP_MEM types.  I accept that, no longer being
a union, this use should be safe.

The string instructions are certainly a spanner in the works, but the
jump instructions need not care about likely/unlikely prefixes.  They
were purely advisory to start with, and are ignored on all modern hardware.

Another spanner in the works is the upcoming meaning of %ds overrides
for jump instructions as the no-track hint from the forthcoming
Control-flow Enforcement extensions.

Given that there can only ever be one active segment override, does it
need to be stored in ea.mem in the first place, or could it live
somewhere else which is mem-neutral?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-07 13:05       ` Andrew Cooper
@ 2016-12-07 13:14         ` Jan Beulich
  2016-12-07 13:15           ` Andrew Cooper
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Beulich @ 2016-12-07 13:14 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 07.12.16 at 14:05, <andrew.cooper3@citrix.com> wrote:
> On 07/12/16 08:22, Jan Beulich wrote:
>>>>> On 06.12.16 at 17:49, <andrew.cooper3@citrix.com> wrote:
>>> On 06/12/16 11:13, Jan Beulich wrote:
>>>> @@ -2359,7 +2360,7 @@ x86_decode(
>>>>          }
>>>>      }
>>>>  
>>>> -    if ( override_seg != -1 && ea.type == OP_MEM )
>>>> +    if ( override_seg != x86_seg_none )
>>>>          ea.mem.seg = override_seg;
>>> Could we get away with asserting ea.type == OP_MEM if override_seg is
>>> set, to help validate our assumptions about state?  (Possibly even
>>> passing #UD back in the non-debug case)
>> That would be wrong - we'd be asserting guest controlled state.
>> There's nothing preventing a segment override from being present on
>> instructions without memory operands.
> 
> True, but such overrides should have no effect on the instruction.

Correct. But we can't ASSERT() anything for that reason.

>> And for example string
>> insns don't have OP_MEM set despite having (implicit) memory
>> operands (after all that's the hole reason for the change here
>> [but not the patch as a hole], as the following PV priv-op patch
> 
> Do you mean whole instead of hole here?

Yes - shame on me, even twice the same mistake.

>> requires the segment override to take effect on OUTS). Nor
>> would such be correct for conditional branches, where some of
>> the segment overrides have a different meaning (necessarily
>> ignored by the emulator).
> 
> The point I am trying to address is that it feels wrong to be
> referencing ea.mem for non OP_MEM types.  I accept that, no longer being
> a union, this use should be safe.
> 
> The string instructions are certainly a spanner in the works, but the
> jump instructions need not care about likely/unlikely prefixes.  They
> were purely advisory to start with, and are ignored on all modern hardware.
> 
> Another spanner in the works is the upcoming meaning of %ds overrides
> for jump instructions as the no-track hint from the forthcoming
> Control-flow Enforcement extensions.
> 
> Given that there can only ever be one active segment override, does it
> need to be stored in ea.mem in the first place, or could it live
> somewhere else which is mem-neutral?

It could, but that wouldn't make ea.mem.seg go away, and it's a
convenient place to put it. Also if we put it elsewhere we'd have
to (a) change all existing uses (including all places structure-
assigning dst or src from ea), and (b) be rather careful with future
changes.
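
For reference, the hunk we're discussing, with the OUTS motivation
summarised in a comment (the comment is just for illustration here,
it's not part of the patch):

    /*
     * Latch any override unconditionally: string insns like OUTS have
     * only implicit memory operands (ea.type != OP_MEM), yet a segment
     * override prefix must still take effect on their source.
     */
    if ( override_seg != x86_seg_none )
        ea.mem.seg = override_seg;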

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 2/5] x86emul: don't assume a memory operand
  2016-12-07 13:14         ` Jan Beulich
@ 2016-12-07 13:15           ` Andrew Cooper
  0 siblings, 0 replies; 24+ messages in thread
From: Andrew Cooper @ 2016-12-07 13:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 07/12/16 13:14, Jan Beulich wrote:
>>>> On 07.12.16 at 14:05, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/16 08:22, Jan Beulich wrote:
>>>>>> On 06.12.16 at 17:49, <andrew.cooper3@citrix.com> wrote:
>>>> On 06/12/16 11:13, Jan Beulich wrote:
>>>>> @@ -2359,7 +2360,7 @@ x86_decode(
>>>>>          }
>>>>>      }
>>>>>  
>>>>> -    if ( override_seg != -1 && ea.type == OP_MEM )
>>>>> +    if ( override_seg != x86_seg_none )
>>>>>          ea.mem.seg = override_seg;
>>>> Could we get away with asserting ea.type == OP_MEM if override_seg is
>>>> set, to help validate our assumptions about state?  (Possibly even
>>>> passing #UD back in the non-debug case)
>>> That would be wrong - we'd be asserting guest controlled state.
>>> There's nothing preventing a segment override from being present on
>>> instructions without memory operands.
>> True, but such overrides should have no effect on the instruction.
> Correct. But we can't ASSERT() anything for that reason.
>
>>> And for example string
>>> insns don't have OP_MEM set despite having (implicit) memory
>>> operands (after all that's the hole reason for the change here
>>> [but not the patch as a hole], as the following PV priv-op patch
>> Do you mean whole instead of hole here?
> Yes - shame on me, even twice the same mistake.
>
>>> requires the segment override to take effect on OUTS). Nor
>>> would such be correct for conditional branches, where some of
>>> the segment overrides have a different meaning (necessarily
>>> ignored by the emulator).
>> The point I am trying to address is that it feels wrong to be
>> referencing ea.mem for non OP_MEM types.  I accept that, no longer being
>> a union, this use should be safe.
>>
>> The string instructions are certainly a spanner in the works, but the
>> jump instructions need not care about likely/unlikely prefixes.  They
>> were purely advisory to start with, and are ignored on all modern hardware.
>>
>> Another spanner in the works is the upcoming meaning of %ds overrides
>> for jump instructions as the no-track hint from the forthcoming
>> Control-flow Enforcement extensions.
>>
>> Given that there can only ever be one active segment override, does it
>> need to be stored in ea.mem in the first place, or could it live
>> somewhere else which is mem-neutral?
> It could, but that wouldn't make ea.mem.seg go away, and it's a
> convenient place to put it. Also if we put it elsewhere we'd have
> to (a) change all existing uses (including all places structure-
> assigning dst or src from ea), and (b) be rather careful with future
> changes.

Ok.  For now, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>,
but I would like to see if we can improve this in the future.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional
  2016-12-07  8:32     ` Jan Beulich
@ 2016-12-07 13:36       ` Andrew Cooper
  2016-12-07 13:54         ` Jan Beulich
  0 siblings, 1 reply; 24+ messages in thread
From: Andrew Cooper @ 2016-12-07 13:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 07/12/16 08:32, Jan Beulich wrote:
>>>> On 06.12.16 at 18:34, <andrew.cooper3@citrix.com> wrote:
>> On 06/12/16 11:15, Jan Beulich wrote:
>>> While the read and fetch hooks are basically unavoidable, write and
>>> cmpxchg aren't really needed by that many insns.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> As a corollary, please can we gain
>>
>> ASSERT(ops->read && ops->fetch && ops->cpuid)
>>
>> at the start of x86_emulate/decode to make it rather more obvious that
>> these are required.  This bit me while developing the AFL harness.
> Well, not exactly: The ->read hook formally isn't required by the
> decoder, so I'd prefer to assert its presence on x86_emulate().

Ok.

> And I don't see why the ->cpuid() hook would be required all of a
> sudden - all its uses are guarded by a NULL check.

You made a convincing argument in c/s 043ad80 "x86: always supply
.cpuid() handler to x86_emulate()", as to why the ->cpuid() hook should
be expected.

Especially if we are going to be adding more CPUID checks going forward,
I would take this opportunity to make it mandatory.

>
>>> @@ -2624,13 +2626,18 @@ x86_emulate(
>>>          }
>>>          else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
>>>          {
>>> +            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
>>>              if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
>>>                                    &dst.val, dst.bytes, ctxt, ops)) )
>>>                  goto done;
>>>              dst.orig_val = dst.val;
>>>          }
>>> -        else /* Lock prefix is allowed only on RMW instructions. */
>>> +        else
>>> +        {
>>> +            /* Lock prefix is allowed only on RMW instructions. */
>>>              generate_exception_if(lock_prefix, EXC_UD);
>>> +            fail_if(!ops->write);
>> I am not sure that these two new fail_if()'s are sensibly placed here,
>> remote from the use of the hooks they are protecting against.
> Well - I don't see a point continuing the emulation attempt in that
> case. They're being duplicated in the writeback code already
> anyway, for safety reasons.

Ok.  How about some comment indicating "bail early if we won't be able
to complete the operation" ?

>
>>> @@ -4707,6 +4724,8 @@ x86_emulate(
>>>              if ( !(b & 1) )
>>>                  rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>>>                                 ea.bytes, ctxt);
>>> +            else
>>> +                fail_if(!ops->write);
>> Again, this wants moving closer to the ->write call.
>>
>> I don't think we need to worry about partially-emulated instructions
>> which fail due to a lack of write.  Anything we get wrong like that will
>> be obvious.
> ... I'm rather hesitant to do what you ask for here: I'm of the
> opinion that we shouldn't alter machine state (MMX/XMM/YMM
> registers) in that case.

Fair point.

> Could you live with it staying here and an ASSERT() being added right ahead of the use?

Yes, or indeed just a comment indicating why the check is early.  After
all, it's still in the same emulation block and we can reasonably expect
people to consider the block as a whole.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional
  2016-12-07 13:36       ` Andrew Cooper
@ 2016-12-07 13:54         ` Jan Beulich
  0 siblings, 0 replies; 24+ messages in thread
From: Jan Beulich @ 2016-12-07 13:54 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 07.12.16 at 14:36, <andrew.cooper3@citrix.com> wrote:
> On 07/12/16 08:32, Jan Beulich wrote:
>> And I don't see why the ->cpuid() hook would be required all of a
>> sudden - all its uses are guarded by a NULL check.
> 
> You made it convincing argument in c/s 043ad80 "x86: always supply
> .cpuid() handler to x86_emulate()", as to why the ->cpuid() hook should
> be expected.
> 
> Especially if we are going to be adding more CPUID checks going forward,
> I would take this opportunity to make it mandatory.

But not in this patch, no. All the more so since such a patch should
then also remove all the respective NULL checks. Plus at least
x86_decode_insn() invokes x86_decode() without a cpuid hook, so
further care would be needed. Overall I'm not convinced it is a
good idea to close off the option of x86_emulate() uses without that
hook - see the "as long as respective instructions may get used in
that case" in the description of the commit you point out.

>>>> @@ -2624,13 +2626,18 @@ x86_emulate(
>>>>          }
>>>>          else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
>>>>          {
>>>> +            fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
>>>>              if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
>>>>                                    &dst.val, dst.bytes, ctxt, ops)) )
>>>>                  goto done;
>>>>              dst.orig_val = dst.val;
>>>>          }
>>>> -        else /* Lock prefix is allowed only on RMW instructions. */
>>>> +        else
>>>> +        {
>>>> +            /* Lock prefix is allowed only on RMW instructions. */
>>>>              generate_exception_if(lock_prefix, EXC_UD);
>>>> +            fail_if(!ops->write);
>>> I am not sure that these two new fail_if()'s are sensibly placed here,
>>> remote from the use of the hooks they are protecting against.
>> Well - I don't see a point continuing the emulation attempt in that
>> case. They're being duplicated in the writeback code already
>> anyway, for safety reasons.
> 
> Ok.  How about some comment indicating "bail early if we won't be able
> to complete the operation" ?

I can add a comment, but the wording you suggest doesn't seem
appropriate, given my earlier argument (that it's a bad idea to
update other guest-visible machine state). Granted, I now realize
that wouldn't in fact be a problem here, as the instructions writing
to memory which we emulate so far don't update other machine state,
but it would still set a bad precedent, as there are others which
do. And I really hope to get at least a good part of SSE and AVX
done by the time 4.9 freezes.
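
To be concrete, the comment I'd attach to those early checks would be
along these lines (wording illustrative):

    /*
     * Bail right away if the eventual writeback can't possibly work,
     * so that no other (guest visible) machine state gets updated first.
     */
    fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);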

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2016-12-07 13:54 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-06 11:07 [PATCH v3 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
2016-12-06 11:12 ` [PATCH v3 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
2016-12-06 11:56   ` Andrew Cooper
2016-12-06 13:17     ` Jan Beulich
2016-12-07 10:55       ` Andrew Cooper
2016-12-07 11:23         ` Jan Beulich
2016-12-07 11:39           ` Andrew Cooper
2016-12-06 11:13 ` [PATCH v3 2/5] x86emul: don't assume a memory operand Jan Beulich
2016-12-06 16:49   ` Andrew Cooper
2016-12-07  8:22     ` Jan Beulich
2016-12-07 13:05       ` Andrew Cooper
2016-12-07 13:14         ` Jan Beulich
2016-12-07 13:15           ` Andrew Cooper
2016-12-06 11:14 ` [PATCH v3 3/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
2016-12-06 17:09   ` Andrew Cooper
2016-12-06 11:15 ` [PATCH v3 4/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
2016-12-06 17:34   ` Andrew Cooper
2016-12-07  8:32     ` Jan Beulich
2016-12-07 13:36       ` Andrew Cooper
2016-12-07 13:54         ` Jan Beulich
2016-12-06 11:16 ` [PATCH v3 5/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
2016-12-06 11:21   ` Jan Beulich
2016-12-06 19:14   ` Andrew Cooper
2016-12-07  9:35     ` Jan Beulich
