* [PATCH for-next 4/7] x86/traps: move all PV emulation and hypercalls to pv/traps.c
From: Wei Liu @ 2017-04-06 17:14 UTC
To: Xen-devel; +Cc: Andrew Cooper, Wei Liu, Jan Beulich
Move the following emulation code:
1. invalid op
2. rdtsc
3. privileged instructions
4. gate operation
Move the following hypercalls:
1. do_set_trap_table
2. do_set_debugreg
3. do_get_debugreg
4. do_fpu_taskswitch
Some PV-only helper functions are also moved.
The code movement requires making a few functions non-static. Their
declarations are added to traps.h.
No functional change.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
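Note: emulate_forced_invalid_op() (moved below) keys off the 5-byte
forced-emulation signature 0f 0b 'x' 'e' 'n' -- ud2 followed by the
ASCII string "xen" -- placed immediately before a real cpuid. As a
rough guest-side sketch of the consumer of this interface (the
XEN_EMULATE_PREFIX spelling follows Linux's convention; the wrapper
itself is illustrative, not part of this patch):

#include <stdint.h>

#define XEN_EMULATE_PREFIX ".byte 0x0f, 0x0b, 0x78, 0x65, 0x6e; "

static inline void xen_cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
                             uint32_t *ecx, uint32_t *edx)
{
    /* ud2; .ascii "xen"; cpuid -- traps to Xen as #UD, which then
     * emulates the cpuid via guest_cpuid(). */
    asm volatile ( XEN_EMULATE_PREFIX "cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (leaf), "2" (0) );
}
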
xen/arch/x86/pv/Makefile | 1 +
xen/arch/x86/pv/traps.c | 2152 +++++++++++++++++++++++++++++++++++++++
xen/arch/x86/traps.c | 2357 +++----------------------------------------
xen/include/asm-x86/traps.h | 19 +
4 files changed, 2295 insertions(+), 2234 deletions(-)
create mode 100644 xen/arch/x86/pv/traps.c
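A note to aid review of io_emul_stub_setup() in the new file: the
16-byte stub it assembles per emulated port access has the layout
below (a restatement of the bytes the function writes; nothing here
is new behaviour):

/*
 * 48 b9 <imm64>   movabs $host_to_guest_gpr_switch, %rcx
 * ff d1           callq  *%rcx
 * 66 | 90         data16 prefix for 2-byte accesses, nop otherwise
 * xx              the in/out opcode byte being emulated
 * ib | 90         imm8 port for the e4-e7 forms, nop for the %dx forms
 * c3              ret (jumps to guest_to_host_gpr_switch)
 */
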
diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile
index d8fc13f6fe..c996dcc149 100644
--- a/xen/arch/x86/pv/Makefile
+++ b/xen/arch/x86/pv/Makefile
@@ -3,3 +3,4 @@ subdir-y += compat
obj-y += hypercall.o
obj-bin-y += dom0_build.init.o
obj-bin-y += entry.o
+obj-y += traps.o
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
new file mode 100644
index 0000000000..0f09d858f6
--- /dev/null
+++ b/xen/arch/x86/pv/traps.c
@@ -0,0 +1,2152 @@
+/******************************************************************************
+ * arch/x86/pv/traps.c
+ *
+ * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <acpi/acpi.h>
+
+#include <xen/event.h>
+#include <xen/guest_access.h>
+#include <xen/iocap.h>
+#include <xen/lib.h>
+#include <xen/paging.h>
+#include <xen/spinlock.h>
+#include <xen/types.h>
+#include <xen/trace.h>
+#include <xsm/xsm.h>
+
+#include <asm/debugreg.h>
+#include <asm/hpet.h>
+#include <asm/mc146818rtc.h>
+#include <asm/regs.h>
+#include <asm/shared.h>
+#include <asm/traps.h>
+#include <asm/x86_emulate.h>
+
+long unregister_guest_nmi_callback(void)
+{
+ struct vcpu *v = current;
+ struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];
+
+ memset(t, 0, sizeof(*t));
+
+ return 0;
+}
+
+int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+ unsigned int trap_nr)
+{
+ struct vcpu *v;
+ struct trap_info *t;
+
+ BUG_ON(d == NULL);
+ BUG_ON(vcpuid >= d->max_vcpus);
+
+ /* Sanity check - XXX should be more fine grained. */
+ BUG_ON(trap_nr >= NR_VECTORS);
+
+ v = d->vcpu[vcpuid];
+ t = &v->arch.pv_vcpu.trap_ctxt[trap_nr];
+
+ return (t->address != 0);
+}
+
+void pv_inject_event(const struct x86_event *event)
+{
+ struct vcpu *v = current;
+ struct cpu_user_regs *regs = guest_cpu_user_regs();
+ struct trap_bounce *tb;
+ const struct trap_info *ti;
+ const uint8_t vector = event->vector;
+ const bool use_error_code =
+ ((vector < 32) && (TRAP_HAVE_EC & (1u << vector)));
+ unsigned int error_code = event->error_code;
+
+ ASSERT(vector == event->vector); /* Confirm no truncation. */
+ if ( use_error_code )
+ ASSERT(error_code != X86_EVENT_NO_EC);
+ else
+ ASSERT(error_code == X86_EVENT_NO_EC);
+
+ tb = &v->arch.pv_vcpu.trap_bounce;
+ ti = &v->arch.pv_vcpu.trap_ctxt[vector];
+
+ tb->flags = TBF_EXCEPTION;
+ tb->cs = ti->cs;
+ tb->eip = ti->address;
+
+ if ( vector == TRAP_page_fault )
+ {
+ v->arch.pv_vcpu.ctrlreg[2] = event->cr2;
+ arch_set_cr2(v, event->cr2);
+
+ /* Re-set error_code.user flag appropriately for the guest. */
+ error_code &= ~PFEC_user_mode;
+ if ( !guest_kernel_mode(v, regs) )
+ error_code |= PFEC_user_mode;
+
+ trace_pv_page_fault(event->cr2, error_code);
+ }
+ else
+ trace_pv_trap(vector, regs->rip, use_error_code, error_code);
+
+ if ( use_error_code )
+ {
+ tb->flags |= TBF_EXCEPTION_ERRCODE;
+ tb->error_code = error_code;
+ }
+
+ if ( TI_GET_IF(ti) )
+ tb->flags |= TBF_INTERRUPT;
+
+ if ( unlikely(null_trap_bounce(v, tb)) )
+ {
+ gprintk(XENLOG_WARNING,
+ "Unhandled %s fault/trap [#%d, ec=%04x]\n",
+ trapstr(vector), vector, error_code);
+
+ if ( vector == TRAP_page_fault )
+ show_page_walk(event->cr2);
+ }
+}
+
+static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
+{
+ regs->rip = rip;
+ regs->eflags &= ~X86_EFLAGS_RF;
+ if ( regs->eflags & X86_EFLAGS_TF )
+ {
+ current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
+ pv_inject_guest_trap(TRAP_debug, regs);
+ }
+}
+
+int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
+{
+ char opcode[3];
+ unsigned long eip, rc;
+ struct vcpu *v = current;
+
+ eip = regs->rip;
+ if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
+ {
+ pv_inject_page_fault(0, eip + sizeof(opcode) - rc);
+ return EXCRET_fault_fixed;
+ }
+ if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
+ return 0;
+ eip += sizeof(opcode);
+ pv_soft_rdtsc(v, regs, 1);
+ instruction_done(regs, eip);
+ return EXCRET_fault_fixed;
+}
+
+int emulate_forced_invalid_op(struct cpu_user_regs *regs)
+{
+ char sig[5], instr[2];
+ unsigned long eip, rc;
+ struct cpuid_leaf res;
+
+ eip = regs->rip;
+
+ /* Check for forced emulation signature: ud2 ; .ascii "xen". */
+ if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
+ {
+ pv_inject_page_fault(0, eip + sizeof(sig) - rc);
+ return EXCRET_fault_fixed;
+ }
+ if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
+ return 0;
+ eip += sizeof(sig);
+
+ /* We only emulate CPUID. */
+ if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
+ {
+ pv_inject_page_fault(0, eip + sizeof(instr) - rc);
+ return EXCRET_fault_fixed;
+ }
+ if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
+ return 0;
+
+ /* If cpuid faulting is enabled and CPL>0 inject a #GP in place of #UD. */
+ if ( current->arch.cpuid_faulting && !guest_kernel_mode(current, regs) )
+ {
+ regs->rip = eip;
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return EXCRET_fault_fixed;
+ }
+
+ eip += sizeof(instr);
+
+ guest_cpuid(current, regs->eax, regs->ecx, &res);
+
+ regs->rax = res.a;
+ regs->rbx = res.b;
+ regs->rcx = res.c;
+ regs->rdx = res.d;
+
+ instruction_done(regs, eip);
+
+ trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->rip);
+
+ return EXCRET_fault_fixed;
+}
+
+static unsigned int check_guest_io_breakpoint(struct vcpu *v,
+ unsigned int port, unsigned int len)
+{
+ unsigned int width, i, match = 0;
+ unsigned long start;
+
+ if ( !(v->arch.debugreg[5]) ||
+ !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
+ return 0;
+
+ for ( i = 0; i < 4; i++ )
+ {
+ if ( !(v->arch.debugreg[5] &
+ (3 << (i * DR_ENABLE_SIZE))) )
+ continue;
+
+ start = v->arch.debugreg[i];
+ width = 0;
+
+ switch ( (v->arch.debugreg[7] >>
+ (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
+ {
+ case DR_LEN_1: width = 1; break;
+ case DR_LEN_2: width = 2; break;
+ case DR_LEN_4: width = 4; break;
+ case DR_LEN_8: width = 8; break;
+ }
+
+ if ( (start < (port + len)) && ((start + width) > port) )
+ match |= 1 << i;
+ }
+
+ return match;
+}
+
+/*
+ * Called from asm to set up the MCE trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+int set_guest_machinecheck_trapbounce(void)
+{
+ struct vcpu *v = current;
+ struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
+
+ pv_inject_guest_trap(TRAP_machine_check, guest_cpu_user_regs());
+ tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
+ return !null_trap_bounce(v, tb);
+}
+
+/*
+ * Called from asm to set up the NMI trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+int set_guest_nmi_trapbounce(void)
+{
+ struct vcpu *v = current;
+ struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
+ pv_inject_guest_trap(TRAP_nmi, guest_cpu_user_regs());
+ tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
+ return !null_trap_bounce(v, tb);
+}
+
+long do_set_trap_table(XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps)
+{
+ struct trap_info cur;
+ struct vcpu *curr = current;
+ struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt;
+ long rc = 0;
+
+ /* If no table is presented then clear the entire virtual IDT. */
+ if ( guest_handle_is_null(traps) )
+ {
+ memset(dst, 0, NR_VECTORS * sizeof(*dst));
+ init_int80_direct_trap(curr);
+ return 0;
+ }
+
+ for ( ; ; )
+ {
+ if ( copy_from_guest(&cur, traps, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ if ( cur.address == 0 )
+ break;
+
+ if ( !is_canonical_address(cur.address) )
+ return -EINVAL;
+
+ fixup_guest_code_selector(curr->domain, cur.cs);
+
+ memcpy(&dst[cur.vector], &cur, sizeof(cur));
+
+ if ( cur.vector == 0x80 )
+ init_int80_direct_trap(curr);
+
+ guest_handle_add_offset(traps, 1);
+
+ if ( hypercall_preempt_check() )
+ {
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_set_trap_table, "h", traps);
+ break;
+ }
+ }
+
+ return rc;
+}
+
+long do_set_debugreg(int reg, unsigned long value)
+{
+ return set_debugreg(current, reg, value);
+}
+
+unsigned long do_get_debugreg(int reg)
+{
+ struct vcpu *curr = current;
+
+ switch ( reg )
+ {
+ case 0 ... 3:
+ case 6:
+ return curr->arch.debugreg[reg];
+ case 7:
+ return (curr->arch.debugreg[7] |
+ curr->arch.debugreg[5]);
+ case 4 ... 5:
+ return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
+ curr->arch.debugreg[reg + 2] : 0);
+ }
+
+ return -EINVAL;
+}
+
+long do_fpu_taskswitch(int set)
+{
+ struct vcpu *v = current;
+
+ if ( set )
+ {
+ v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS;
+ stts();
+ }
+ else
+ {
+ v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+ if ( v->fpu_dirtied )
+ clts();
+ }
+
+ return 0;
+}
+
+static int read_descriptor(unsigned int sel,
+ const struct vcpu *v,
+ unsigned long *base,
+ unsigned long *limit,
+ unsigned int *ar,
+ bool_t insn_fetch)
+{
+ struct desc_struct desc;
+
+ if ( sel < 4)
+ desc.b = desc.a = 0;
+ else if ( __get_user(desc,
+ (const struct desc_struct *)(!(sel & 4)
+ ? GDT_VIRT_START(v)
+ : LDT_VIRT_START(v))
+ + (sel >> 3)) )
+ return 0;
+ if ( !insn_fetch )
+ desc.b &= ~_SEGMENT_L;
+
+ *ar = desc.b & 0x00f0ff00;
+ if ( !(desc.b & _SEGMENT_L) )
+ {
+ *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
+ (desc.b & 0xff000000));
+ *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
+ if ( desc.b & _SEGMENT_G )
+ *limit = ((*limit + 1) << 12) - 1;
+#ifndef NDEBUG
+ if ( sel > 3 )
+ {
+ unsigned int a, l;
+ unsigned char valid;
+
+ asm volatile (
+ "larl %2,%0 ; setz %1"
+ : "=r" (a), "=qm" (valid) : "rm" (sel));
+ BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
+ asm volatile (
+ "lsll %2,%0 ; setz %1"
+ : "=r" (l), "=qm" (valid) : "rm" (sel));
+ BUG_ON(valid && (l != *limit));
+ }
+#endif
+ }
+ else
+ {
+ *base = 0UL;
+ *limit = ~0UL;
+ }
+
+ return 1;
+}
+
+static int read_gate_descriptor(unsigned int gate_sel,
+ const struct vcpu *v,
+ unsigned int *sel,
+ unsigned long *off,
+ unsigned int *ar)
+{
+ struct desc_struct desc;
+ const struct desc_struct *pdesc;
+
+
+ pdesc = (const struct desc_struct *)
+ (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
+ + (gate_sel >> 3);
+ if ( (gate_sel < 4) ||
+ ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
+ __get_user(desc, pdesc) )
+ return 0;
+
+ *sel = (desc.a >> 16) & 0x0000fffc;
+ *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
+ *ar = desc.b & 0x0000ffff;
+
+ /*
+ * check_descriptor() clears the DPL field and stores the
+ * guest requested DPL in the selector's RPL field.
+ */
+ if ( *ar & _SEGMENT_DPL )
+ return 0;
+ *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+
+ if ( !is_pv_32bit_vcpu(v) )
+ {
+ if ( (*ar & 0x1f00) != 0x0c00 ||
+ (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+ __get_user(desc, pdesc + 1) ||
+ (desc.b & 0x1f00) )
+ return 0;
+
+ *off |= (unsigned long)desc.a << 32;
+ return 1;
+ }
+
+ switch ( *ar & 0x1f00 )
+ {
+ case 0x0400:
+ *off &= 0xffff;
+ break;
+ case 0x0c00:
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+ unsigned int bytes, unsigned long limit,
+ enum x86_segment seg,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned long *addr)
+{
+ int rc = X86EMUL_OKAY;
+
+ *addr = base + offset;
+
+ if ( ctxt->addr_size < 64 )
+ {
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ rc = X86EMUL_EXCEPTION;
+ *addr = (uint32_t)*addr;
+ }
+ else if ( !__addr_ok(*addr) )
+ rc = X86EMUL_EXCEPTION;
+
+ if ( unlikely(rc == X86EMUL_EXCEPTION) )
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+
+ return rc;
+}
+
+struct priv_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ char *io_emul_stub;
+ unsigned int bpmatch;
+ unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct priv_op_ctxt *poc =
+ container_of(ctxt, struct priv_op_ctxt, ctxt);
+ unsigned int rc;
+ unsigned long addr = poc->cs.base + offset;
+
+ ASSERT(seg == x86_seg_cs);
+
+ /* We don't mean to emulate any branches. */
+ if ( !bytes )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+ x86_seg_cs, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+ {
+ /*
+ * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+ * cpu_has_nx, but we'd then need a "fetch" variant of
+ * __copy_from_user() respecting NX, SMEP, and protection keys.
+ */
+ x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+ struct segment_register *reg,
+ struct x86_emulate_ctxt *ctxt)
+{
+ /* Check if this is an attempt to access the I/O bitmap. */
+ if ( seg == x86_seg_tr )
+ {
+ switch ( ctxt->opcode )
+ {
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ /* Defer the check to priv_op_{read,write}_io(). */
+ return X86EMUL_DONE;
+ }
+ }
+
+ if ( ctxt->addr_size < 64 )
+ {
+ unsigned long limit;
+ unsigned int sel, ar;
+
+ switch ( seg )
+ {
+ case x86_seg_cs: sel = ctxt->regs->cs; break;
+ case x86_seg_ds: sel = read_sreg(ds); break;
+ case x86_seg_es: sel = read_sreg(es); break;
+ case x86_seg_fs: sel = read_sreg(fs); break;
+ case x86_seg_gs: sel = read_sreg(gs); break;
+ case x86_seg_ss: sel = ctxt->regs->ss; break;
+ default: return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+ return X86EMUL_UNHANDLEABLE;
+
+ reg->limit = limit;
+ reg->attr.bytes = ar >> 8;
+ }
+ else
+ {
+ switch ( seg )
+ {
+ default:
+ if ( !is_x86_user_segment(seg) )
+ return X86EMUL_UNHANDLEABLE;
+ reg->base = 0;
+ break;
+ case x86_seg_fs:
+ reg->base = rdfsbase();
+ break;
+ case x86_seg_gs:
+ reg->base = rdgsbase();
+ break;
+ }
+
+ reg->limit = ~0U;
+
+ reg->attr.bytes = 0;
+ reg->attr.fields.type = _SEGMENT_WR >> 8;
+ if ( seg == x86_seg_cs )
+ {
+ reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+ reg->attr.fields.l = 1;
+ }
+ else
+ reg->attr.fields.db = 1;
+ reg->attr.fields.s = 1;
+ reg->attr.fields.dpl = 3;
+ reg->attr.fields.p = 1;
+ reg->attr.fields.g = 1;
+ }
+
+ /*
+ * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+ * Also do this for consistency for non-conforming code segments.
+ */
+ if ( (seg == x86_seg_ss ||
+ (seg == x86_seg_cs &&
+ !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+ guest_kernel_mode(current, ctxt->regs) )
+ reg->attr.fields.dpl = 0;
+
+ return X86EMUL_OKAY;
+}
+
+/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
+static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
+{
+ unsigned int cpl = guest_kernel_mode(v, regs) ?
+ (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
+
+ ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
+
+ return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
+}
+
+/* Has the guest requested sufficient permission for this I/O access? */
+static int guest_io_okay(
+ unsigned int port, unsigned int bytes,
+ struct vcpu *v, struct cpu_user_regs *regs)
+{
+ /* If in user mode, switch to kernel mode just to read I/O bitmap. */
+ int user_mode = !(v->arch.flags & TF_kernel_mode);
+#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
+
+ if ( iopl_ok(v, regs) )
+ return 1;
+
+ if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
+ {
+ union { uint8_t bytes[2]; uint16_t mask; } x;
+
+ /*
+ * Grab permission bytes from guest space. Inaccessible bytes are
+ * read as 0xff (no access allowed).
+ */
+ TOGGLE_MODE();
+ switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
+ port>>3, 2) )
+ {
+ default: x.bytes[0] = ~0;
+ /* fallthrough */
+ case 1: x.bytes[1] = ~0;
+ /* fallthrough */
+ case 0: break;
+ }
+ TOGGLE_MODE();
+
+ if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Has the administrator granted sufficient permission for this I/O access? */
+static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
+ const struct domain *d)
+{
+ /*
+ * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
+ * We never permit direct access to that register.
+ */
+ if ( (port == 0xcf8) && (bytes == 4) )
+ return 0;
+
+ /* We also never permit direct access to the RTC/CMOS registers. */
+ if ( ((port & ~1) == RTC_PORT(0)) )
+ return 0;
+
+ return ioports_access_permitted(d, port, port + bytes - 1);
+}
+
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+ unsigned int size, uint32_t *write)
+{
+ uint32_t machine_bdf;
+
+ if ( !is_hardware_domain(currd) )
+ return 0;
+
+ if ( !CF8_ENABLED(currd->arch.pci_cf8) )
+ return 1;
+
+ machine_bdf = CF8_BDF(currd->arch.pci_cf8);
+ if ( write )
+ {
+ const unsigned long *ro_map = pci_get_ro_map(0);
+
+ if ( ro_map && test_bit(machine_bdf, ro_map) )
+ return 0;
+ }
+ start |= CF8_ADDR_LO(currd->arch.pci_cf8);
+ /* AMD extended configuration space access? */
+ if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
+ {
+ uint64_t msr_val;
+
+ if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
+ return 0;
+ if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
+ start |= CF8_ADDR_HI(currd->arch.pci_cf8);
+ }
+
+ return !write ?
+ xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+ start, start + size - 1, 0) == 0 :
+ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
+}
+
+uint32_t guest_io_read(unsigned int port, unsigned int bytes,
+ struct domain *currd)
+{
+ uint32_t data = 0;
+ unsigned int shift = 0;
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ switch ( bytes )
+ {
+ case 1: return inb(port);
+ case 2: return inw(port);
+ case 4: return inl(port);
+ }
+ }
+
+ while ( bytes != 0 )
+ {
+ unsigned int size = 1;
+ uint32_t sub_data = ~0;
+
+ if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+ {
+ sub_data = pv_pit_handler(port, 0, 0);
+ }
+ else if ( port == RTC_PORT(0) )
+ {
+ sub_data = currd->arch.cmos_idx;
+ }
+ else if ( (port == RTC_PORT(1)) &&
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ sub_data = inb(RTC_PORT(1));
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+ else if ( (port == 0xcf8) && (bytes == 4) )
+ {
+ size = 4;
+ sub_data = currd->arch.pci_cf8;
+ }
+ else if ( (port & 0xfffc) == 0xcfc )
+ {
+ size = min(bytes, 4 - (port & 3));
+ if ( size == 3 )
+ size = 2;
+ if ( pci_cfg_ok(currd, port & 3, size, NULL) )
+ sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
+ }
+
+ if ( size == 4 )
+ return sub_data;
+
+ data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
+ shift += size * 8;
+ port += size;
+ bytes -= size;
+ }
+
+ return data;
+}
+
+void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
+ struct domain *currd)
+{
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ switch ( bytes ) {
+ case 1:
+ outb((uint8_t)data, port);
+ if ( pv_post_outb_hook )
+ pv_post_outb_hook(port, (uint8_t)data);
+ break;
+ case 2:
+ outw((uint16_t)data, port);
+ break;
+ case 4:
+ outl(data, port);
+ break;
+ }
+ return;
+ }
+
+ while ( bytes != 0 )
+ {
+ unsigned int size = 1;
+
+ if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+ {
+ pv_pit_handler(port, (uint8_t)data, 1);
+ }
+ else if ( port == RTC_PORT(0) )
+ {
+ currd->arch.cmos_idx = data;
+ }
+ else if ( (port == RTC_PORT(1)) &&
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+ {
+ unsigned long flags;
+
+ if ( pv_rtc_handler )
+ pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
+ spin_lock_irqsave(&rtc_lock, flags);
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ outb(data, RTC_PORT(1));
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+ else if ( (port == 0xcf8) && (bytes == 4) )
+ {
+ size = 4;
+ currd->arch.pci_cf8 = data;
+ }
+ else if ( (port & 0xfffc) == 0xcfc )
+ {
+ size = min(bytes, 4 - (port & 3));
+ if ( size == 3 )
+ size = 2;
+ if ( pci_cfg_ok(currd, port & 3, size, &data) )
+ pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
+ }
+
+ if ( size == 4 )
+ return;
+
+ port += size;
+ bytes -= size;
+ data >>= size * 8;
+ }
+}
+
+/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
+void host_to_guest_gpr_switch(struct cpu_user_regs *);
+unsigned long guest_to_host_gpr_switch(unsigned long);
+
+void (*pv_post_outb_hook)(unsigned int port, u8 value);
+
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+ unsigned int port, unsigned int bytes)
+{
+ if ( !ctxt->io_emul_stub )
+ ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+ (this_cpu(stubs.addr) &
+ ~PAGE_MASK) +
+ STUB_BUF_SIZE / 2;
+
+ /* movq $host_to_guest_gpr_switch,%rcx */
+ ctxt->io_emul_stub[0] = 0x48;
+ ctxt->io_emul_stub[1] = 0xb9;
+ *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+ /* callq *%rcx */
+ ctxt->io_emul_stub[10] = 0xff;
+ ctxt->io_emul_stub[11] = 0xd1;
+ /* data16 or nop */
+ ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+ /* <io-access opcode> */
+ ctxt->io_emul_stub[13] = opcode;
+ /* imm8 or nop */
+ ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+ /* ret (jumps to guest_to_host_gpr_switch) */
+ ctxt->io_emul_stub[15] = 0xc3;
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+ if ( ioemul_handle_quirk )
+ ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+ /* Handy function-typed pointer to the stub. */
+ return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+ unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* INS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ return X86EMUL_DONE;
+ }
+
+ *val = guest_io_read(port, bytes, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+ unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* OUTS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ if ( (bytes == 1) && pv_post_outb_hook )
+ pv_post_outb_hook(port, val);
+ return X86EMUL_DONE;
+ }
+
+ guest_io_write(port, bytes, val, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+ enum x86_segment seg, unsigned long offset,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ ASSERT(seg == x86_seg_es);
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+ {
+ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, x86_seg_es, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(PFEC_write_access,
+ addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+ uint16_t port,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(seg, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+ {
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = 0;
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, seg, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ guest_io_write(port, bytes_per_rep, data, currd);
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct vcpu *curr = current;
+
+ switch ( reg )
+ {
+ case 0: /* Read CR0 */
+ *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+ return X86EMUL_OKAY;
+
+ case 2: /* Read CR2 */
+ case 4: /* Read CR4 */
+ *val = curr->arch.pv_vcpu.ctrlreg[reg];
+ return X86EMUL_OKAY;
+
+ case 3: /* Read CR3 */
+ {
+ const struct domain *currd = curr->domain;
+ unsigned long mfn;
+
+ if ( !is_pv_32bit_domain(currd) )
+ {
+ mfn = pagetable_get_pfn(curr->arch.guest_table);
+ *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+ }
+ else
+ {
+ l4_pgentry_t *pl4e =
+ map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+ mfn = l4e_get_pfn(*pl4e);
+ unmap_domain_page(pl4e);
+ *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+ }
+ /* PTs should not be shared */
+ BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+ return X86EMUL_OKAY;
+ }
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *curr = current;
+
+ switch ( reg )
+ {
+ case 0: /* Write CR0 */
+ if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Attempt to change unmodifiable CR0 flags\n");
+ break;
+ }
+ do_fpu_taskswitch(!!(val & X86_CR0_TS));
+ return X86EMUL_OKAY;
+
+ case 2: /* Write CR2 */
+ curr->arch.pv_vcpu.ctrlreg[2] = val;
+ arch_set_cr2(curr, val);
+ return X86EMUL_OKAY;
+
+ case 3: /* Write CR3 */
+ {
+ struct domain *currd = curr->domain;
+ unsigned long gfn;
+ struct page_info *page;
+ int rc;
+
+ gfn = !is_pv_32bit_domain(currd)
+ ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+ page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+ if ( !page )
+ break;
+ rc = new_guest_cr3(page_to_mfn(page));
+ put_page(page);
+
+ switch ( rc )
+ {
+ case 0:
+ return X86EMUL_OKAY;
+ case -ERESTART: /* retry after preemption */
+ return X86EMUL_RETRY;
+ }
+ break;
+ }
+
+ case 4: /* Write CR4 */
+ curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+ write_cr4(pv_guest_cr4_to_real_cr4(curr));
+ ctxt_switch_levelling(curr);
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long res = do_get_debugreg(reg);
+
+ if ( IS_ERR_VALUE(res) )
+ return X86EMUL_UNHANDLEABLE;
+
+ *val = res;
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ return do_set_debugreg(reg, val) == 0
+ ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+
+static inline uint64_t guest_misc_enable(uint64_t val)
+{
+ val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+ MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
+ val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+ return val;
+}
+
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+ return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+ is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ const struct vcpu *curr = current;
+ const struct domain *currd = curr->domain;
+ bool vpmu_msr = false;
+
+ switch ( reg )
+ {
+ int rc;
+
+ case MSR_FS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+ return X86EMUL_OKAY;
+
+ case MSR_GS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = cpu_has_fsgsbase ? __rdgsbase()
+ : curr->arch.pv_vcpu.gs_base_kernel;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = curr->arch.pv_vcpu.gs_base_user;
+ return X86EMUL_OKAY;
+
+ /*
+ * In order to fully retain original behavior, defer calling
+ * pv_soft_rdtsc() until after emulation. This may want/need to be
+ * reconsidered.
+ */
+ case MSR_IA32_TSC:
+ poc->tsc |= TSC_BASE;
+ goto normal;
+
+ case MSR_TSC_AUX:
+ poc->tsc |= TSC_AUX;
+ if ( cpu_has_rdtscp )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+ *val = read_efer();
+ if ( is_pv_32bit_domain(currd) )
+ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_CTL:
+ case MSR_K7_FID_VID_STATUS:
+ case MSR_K8_PSTATE_LIMIT:
+ case MSR_K8_PSTATE_CTRL:
+ case MSR_K8_PSTATE_STATUS:
+ case MSR_K8_PSTATE0:
+ case MSR_K8_PSTATE1:
+ case MSR_K8_PSTATE2:
+ case MSR_K8_PSTATE3:
+ case MSR_K8_PSTATE4:
+ case MSR_K8_PSTATE5:
+ case MSR_K8_PSTATE6:
+ case MSR_K8_PSTATE7:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+ break;
+ if ( unlikely(is_cpufreq_controller(currd)) )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_UCODE_REV:
+ BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+ break;
+ /* As documented in the SDM: Do a CPUID 1 here */
+ cpuid_eax(1);
+ }
+ goto normal;
+
+ case MSR_IA32_MISC_ENABLE:
+ if ( rdmsr_safe(reg, *val) )
+ break;
+ *val = guest_misc_enable(*val);
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR0_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+ break;
+ *val = curr->arch.pv_vcpu.dr_mask[0];
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+ break;
+ *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_PERF_CAPABILITIES:
+ /* No extra capabilities are supported. */
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_PLATFORM_INFO:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
+ break;
+ *val = 0;
+ if ( this_cpu(cpuid_faulting_enabled) )
+ *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
+ break;
+ *val = 0;
+ if ( curr->arch.cpuid_faulting )
+ *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
+ return X86EMUL_OKAY;
+
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ vpmu_msr = true;
+ /* fall through */
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+ if ( vpmu_do_rdmsr(reg, val) )
+ break;
+ return X86EMUL_OKAY;
+ }
+ }
+ /* fall through */
+ default:
+ if ( rdmsr_hypervisor_regs(reg, val) )
+ return X86EMUL_OKAY;
+
+ rc = vmce_rdmsr(reg, val);
+ if ( rc < 0 )
+ break;
+ if ( rc )
+ return X86EMUL_OKAY;
+ /* fall through */
+ normal:
+ /* Everyone can read the MSR space. */
+ /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+ if ( rdmsr_safe(reg, *val) )
+ break;
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+#include "../x86_64/mmconfig.h"
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *curr = current;
+ const struct domain *currd = curr->domain;
+ bool vpmu_msr = false;
+
+ switch ( reg )
+ {
+ uint64_t temp;
+ int rc;
+
+ case MSR_FS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrfsbase(val);
+ curr->arch.pv_vcpu.fs_base = val;
+ return X86EMUL_OKAY;
+
+ case MSR_GS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrgsbase(val);
+ curr->arch.pv_vcpu.gs_base_kernel = val;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrmsrl(MSR_SHADOW_GS_BASE, val);
+ curr->arch.pv_vcpu.gs_base_user = val;
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_STATUS:
+ case MSR_K7_FID_VID_CTL:
+ case MSR_K8_PSTATE_LIMIT:
+ case MSR_K8_PSTATE_CTRL:
+ case MSR_K8_PSTATE_STATUS:
+ case MSR_K8_PSTATE0:
+ case MSR_K8_PSTATE1:
+ case MSR_K8_PSTATE2:
+ case MSR_K8_PSTATE3:
+ case MSR_K8_PSTATE4:
+ case MSR_K8_PSTATE5:
+ case MSR_K8_PSTATE6:
+ case MSR_K8_PSTATE7:
+ case MSR_K8_HWCR:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_AMD64_NB_CFG:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+ ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+ break;
+ if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+ temp != val :
+ ((temp ^ val) &
+ ~(FAM10H_MMIO_CONF_ENABLE |
+ (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+ FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+ ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+ FAM10H_MMIO_CONF_BASE_SHIFT))) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( rdmsr_safe(reg, temp) )
+ break;
+ if ( val )
+ goto invalid;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_MISC_ENABLE:
+ if ( rdmsr_safe(reg, temp) )
+ break;
+ if ( val != guest_misc_enable(temp) )
+ goto invalid;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_MPERF:
+ case MSR_IA32_APERF:
+ if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_PERF_CTL:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_THERM_CONTROL:
+ case MSR_IA32_ENERGY_PERF_BIAS:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_AMD64_DR0_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+ break;
+ curr->arch.pv_vcpu.dr_mask[0] = val;
+ if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+ wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+ break;
+ curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+ if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+ wrmsrl(reg, val);
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_PLATFORM_INFO:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
+ break;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
+ rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
+ break;
+ if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
+ !this_cpu(cpuid_faulting_enabled) )
+ break;
+ curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
+ return X86EMUL_OKAY;
+
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ vpmu_msr = true;
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+ if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+ !is_hardware_domain(currd) )
+ return X86EMUL_OKAY;
+
+ if ( vpmu_do_wrmsr(reg, val, 0) )
+ break;
+ return X86EMUL_OKAY;
+ }
+ }
+ /* fall through */
+ default:
+ if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+ return X86EMUL_OKAY;
+
+ rc = vmce_wrmsr(reg, val);
+ if ( rc < 0 )
+ break;
+ if ( rc )
+ return X86EMUL_OKAY;
+
+ if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+ invalid:
+ gdprintk(XENLOG_WARNING,
+ "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+ reg, temp, val);
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ /* Ignore the instruction if unprivileged. */
+ if ( !cache_flush_permitted(current->domain) )
+ /*
+ * Non-physdev domain attempted WBINVD; ignore for now since
+ * newer linux uses this in some start-of-day timing loops.
+ */
+ ;
+ else
+ wbinvd();
+
+ return X86EMUL_OKAY;
+}
+
+int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
+ struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
+{
+ guest_cpuid(current, leaf, subleaf, res);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_validate(const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt)
+{
+ switch ( ctxt->opcode )
+ {
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ case X86EMUL_OPC(0x0f, 0x06): /* clts */
+ case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+ case X86EMUL_OPC(0x0f, 0x20) ...
+ X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+ case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+ case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+ case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+ case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+ return X86EMUL_OKAY;
+
+ case 0xfa: case 0xfb: /* cli / sti */
+ if ( !iopl_ok(current, ctxt->regs) )
+ break;
+ /*
+ * This is just too dangerous to allow, in my opinion. Consider if the
+ * caller then tries to reenable interrupts using POPF: we can't trap
+ * that and we'll end up with hard-to-debug lockups. Fast & loose will
+ * do for us. :-)
+ vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
+ */
+ return X86EMUL_DONE;
+
+ case X86EMUL_OPC(0x0f, 0x01):
+ {
+ unsigned int modrm_rm, modrm_reg;
+
+ if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+ (modrm_rm & 7) != 1 )
+ break;
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* xsetbv */
+ case 7: /* rdtscp */
+ return X86EMUL_OKAY;
+ }
+ break;
+ }
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+ .insn_fetch = priv_op_insn_fetch,
+ .read = x86emul_unhandleable_rw,
+ .validate = priv_op_validate,
+ .read_io = priv_op_read_io,
+ .write_io = priv_op_write_io,
+ .rep_ins = priv_op_rep_ins,
+ .rep_outs = priv_op_rep_outs,
+ .read_segment = priv_op_read_segment,
+ .read_cr = priv_op_read_cr,
+ .write_cr = priv_op_write_cr,
+ .read_dr = priv_op_read_dr,
+ .write_dr = priv_op_write_dr,
+ .read_msr = priv_op_read_msr,
+ .write_msr = priv_op_write_msr,
+ .cpuid = pv_emul_cpuid,
+ .wbinvd = priv_op_wbinvd,
+};
+
+int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct priv_op_ctxt ctxt = {
+ .ctxt.regs = regs,
+ .ctxt.vendor = currd->arch.cpuid->x86_vendor,
+ };
+ int rc;
+ unsigned int eflags, ar;
+
+ if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 1) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ return 0;
+
+ /* Mirror virtualized state into EFLAGS. */
+ ASSERT(regs->eflags & X86_EFLAGS_IF);
+ if ( vcpu_info(curr, evtchn_upcall_mask) )
+ regs->eflags &= ~X86_EFLAGS_IF;
+ else
+ regs->eflags |= X86_EFLAGS_IF;
+ ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
+ regs->eflags |= curr->arch.pv_vcpu.iopl;
+ eflags = regs->eflags;
+
+ ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+ rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
+
+ if ( ctxt.io_emul_stub )
+ unmap_domain_page(ctxt.io_emul_stub);
+
+ /*
+ * Un-mirror virtualized state from EFLAGS.
+ * Nothing we allow to be emulated can change anything other than the
+ * arithmetic bits, and the resume flag.
+ */
+ ASSERT(!((regs->eflags ^ eflags) &
+ ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
+ regs->eflags |= X86_EFLAGS_IF;
+ regs->eflags &= ~X86_EFLAGS_IOPL;
+
+ switch ( rc )
+ {
+ case X86EMUL_OKAY:
+ if ( ctxt.tsc & TSC_BASE )
+ {
+ if ( ctxt.tsc & TSC_AUX )
+ pv_soft_rdtsc(curr, regs, 1);
+ else if ( currd->arch.vtsc )
+ pv_soft_rdtsc(curr, regs, 0);
+ else
+ msr_split(regs, rdtsc());
+ }
+
+ if ( ctxt.ctxt.retire.singlestep )
+ ctxt.bpmatch |= DR_STEP;
+ if ( ctxt.bpmatch )
+ {
+ curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+ if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+ pv_inject_guest_trap(TRAP_debug, regs);
+ }
+ /* fall through */
+ case X86EMUL_RETRY:
+ return EXCRET_fault_fixed;
+
+ case X86EMUL_EXCEPTION:
+ pv_inject_event(&ctxt.ctxt.event);
+ return EXCRET_fault_fixed;
+ }
+
+ return 0;
+}
+
+
+static inline int check_stack_limit(unsigned int ar, unsigned int limit,
+ unsigned int esp, unsigned int decr)
+{
+ return (((esp - decr) < (esp - 1)) &&
+ (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
+}
+
+struct gate_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ bool insn_fetch;
+};
+
+static int gate_op_read(
+ enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct gate_op_ctxt *goc =
+ container_of(ctxt, struct gate_op_ctxt, ctxt);
+ unsigned int rc = bytes, sel = 0;
+ unsigned long addr = offset, limit = 0;
+
+ switch ( seg )
+ {
+ case x86_seg_cs:
+ addr += goc->cs.base;
+ limit = goc->cs.limit;
+ break;
+ case x86_seg_ds:
+ sel = read_sreg(ds);
+ break;
+ case x86_seg_es:
+ sel = read_sreg(es);
+ break;
+ case x86_seg_fs:
+ sel = read_sreg(fs);
+ break;
+ case x86_seg_gs:
+ sel = read_sreg(gs);
+ break;
+ case x86_seg_ss:
+ sel = ctxt->regs->ss;
+ break;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if ( sel )
+ {
+ unsigned int ar;
+
+ ASSERT(!goc->insn_fetch);
+ if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+ return X86EMUL_UNHANDLEABLE;
+ addr += offset;
+ }
+ else if ( seg != x86_seg_cs )
+ return X86EMUL_UNHANDLEABLE;
+
+ /* We don't mean to emulate any branches. */
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ return X86EMUL_UNHANDLEABLE;
+
+ addr = (uint32_t)addr;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+ {
+ /*
+ * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+ * cpu_has_nx, but we'd then need a "fetch" variant of
+ * __copy_from_user() respecting NX, SMEP, and protection keys.
+ */
+ x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+void emulate_gate_op(struct cpu_user_regs *regs)
+{
+ struct vcpu *v = current;
+ unsigned int sel, ar, dpl, nparm, insn_len;
+ struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+ struct x86_emulate_state *state;
+ unsigned long off, base, limit;
+ uint16_t opnd_sel = 0;
+ int jump = -1, rc = X86EMUL_OKAY;
+
+ /* Check whether this fault is due to the use of a call gate. */
+ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
+ (((ar >> 13) & 3) < (regs->cs & 3)) ||
+ ((ar & _SEGMENT_TYPE) != 0xc00) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ if ( !(ar & _SEGMENT_P) )
+ {
+ pv_inject_guest_trap(TRAP_no_segment, regs);
+ return;
+ }
+ dpl = (ar >> 13) & 3;
+ nparm = ar & 0x1f;
+
+ /*
+ * Decode instruction (and perhaps operand) to determine RPL,
+ * whether this is a jump or a call, and the call return offset.
+ */
+ if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+
+ ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+ state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+ ctxt.insn_fetch = false;
+ if ( IS_ERR_OR_NULL(state) )
+ {
+ if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
+ {
+ ASSERT(ctxt.ctxt.event_pending);
+ pv_inject_event(&ctxt.ctxt.event);
+ }
+ else
+ {
+ ASSERT(!ctxt.ctxt.event_pending);
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ }
+ return;
+ }
+
+ switch ( ctxt.ctxt.opcode )
+ {
+ unsigned int modrm_345;
+
+ case 0xea:
+ ++jump;
+ /* fall through */
+ case 0x9a:
+ ++jump;
+ opnd_sel = x86_insn_immediate(state, 1);
+ break;
+ case 0xff:
+ if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+ break;
+ switch ( modrm_345 & 7 )
+ {
+ enum x86_segment seg;
+
+ case 5:
+ ++jump;
+ /* fall through */
+ case 3:
+ ++jump;
+ base = x86_insn_operand_ea(state, &seg);
+ rc = gate_op_read(seg,
+ base + (x86_insn_opsize(state) >> 3),
+ &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
+ break;
+ }
+ break;
+ }
+
+ insn_len = x86_insn_length(state, &ctxt.ctxt);
+ x86_emulate_free_state(state);
+
+ if ( rc == X86EMUL_EXCEPTION )
+ {
+ ASSERT(ctxt.ctxt.event_pending);
+ pv_inject_event(&ctxt.ctxt.event);
+ return;
+ }
+
+ ASSERT(!ctxt.ctxt.event_pending);
+
+ if ( rc != X86EMUL_OKAY ||
+ jump < 0 ||
+ (opnd_sel & ~3) != regs->error_code ||
+ dpl < (opnd_sel & 3) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+
+ if ( !read_descriptor(sel, v, &base, &limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_CODE) ||
+ (!jump || (ar & _SEGMENT_EC) ?
+ ((ar >> 13) & 3) > (regs->cs & 3) :
+ ((ar >> 13) & 3) != (regs->cs & 3)) )
+ {
+ pv_inject_hw_exception(TRAP_gp_fault, sel);
+ return;
+ }
+ if ( !(ar & _SEGMENT_P) )
+ {
+ pv_inject_hw_exception(TRAP_no_segment, sel);
+ return;
+ }
+ if ( off > limit )
+ {
+ pv_inject_hw_exception(TRAP_gp_fault, 0);
+ return;
+ }
+
+ if ( !jump )
+ {
+ unsigned int ss, esp, *stkp;
+ int rc;
+#define push(item) do \
+ { \
+ --stkp; \
+ esp -= 4; \
+ rc = __put_user(item, stkp); \
+ if ( rc ) \
+ { \
+ pv_inject_page_fault(PFEC_write_access, \
+ (unsigned long)(stkp + 1) - rc); \
+ return; \
+ } \
+ } while ( 0 )
+
+ if ( ((ar >> 13) & 3) < (regs->cs & 3) )
+ {
+ sel |= (ar >> 13) & 3;
+ /* Inner stack known only for kernel ring. */
+ if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ esp = v->arch.pv_vcpu.kernel_sp;
+ ss = v->arch.pv_vcpu.kernel_ss;
+ if ( (ss & 3) != (sel & 3) ||
+ !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (sel & 3) ||
+ !(ar & _SEGMENT_S) ||
+ (ar & _SEGMENT_CODE) ||
+ !(ar & _SEGMENT_WR) )
+ {
+ pv_inject_hw_exception(TRAP_invalid_tss, ss & ~3);
+ return;
+ }
+ if ( !(ar & _SEGMENT_P) ||
+ !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
+ {
+ pv_inject_hw_exception(TRAP_stack_error, ss & ~3);
+ return;
+ }
+ stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+ if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ push(regs->ss);
+ push(regs->rsp);
+ if ( nparm )
+ {
+ const unsigned int *ustkp;
+
+ if ( !read_descriptor(regs->ss, v, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (regs->cs & 3) ||
+ !(ar & _SEGMENT_S) ||
+ (ar & _SEGMENT_CODE) ||
+ !(ar & _SEGMENT_WR) ||
+ !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
+ return pv_inject_guest_trap(TRAP_gp_fault, regs);
+ ustkp = (unsigned int *)(unsigned long)
+ ((unsigned int)base + regs->esp + nparm * 4);
+ if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ do
+ {
+ unsigned int parm;
+
+ --ustkp;
+ rc = __get_user(parm, ustkp);
+ if ( rc )
+ {
+ pv_inject_page_fault(0,
+ (unsigned long)(ustkp + 1) - rc);
+ return;
+ }
+ push(parm);
+ } while ( --nparm );
+ }
+ }
+ else
+ {
+ sel |= (regs->cs & 3);
+ esp = regs->rsp;
+ ss = regs->ss;
+ if ( !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (sel & 3) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
+ {
+ pv_inject_hw_exception(TRAP_stack_error, 0);
+ return;
+ }
+ stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+ if ( !compat_access_ok(stkp - 2, 2 * 4) )
+ {
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
+ }
+ }
+ push(regs->cs);
+ push(regs->rip + insn_len);
+#undef push
+ regs->rsp = esp;
+ regs->ss = ss;
+ }
+ else
+ sel |= (regs->cs & 3);
+
+ regs->cs = sel;
+ instruction_done(regs, off);
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
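
[For reference while reviewing: do_set_trap_table(), moved above, takes
an array of trap_info entries terminated by a zero address (a null
guest handle instead clears the entire virtual IDT, and vector 0x80 is
special-cased for the direct int80 path). A rough sketch of the guest
side, assuming the usual HYPERVISOR_set_trap_table wrapper and
FLAT_KERNEL_CS from the public headers, with placeholder handler names:

static struct trap_info traps[] = {
    /* { vector, flags (DPL in bits 0-1), cs, address } */
    {  3, 3, FLAT_KERNEL_CS, (unsigned long)int3_entry       },
    { 14, 0, FLAT_KERNEL_CS, (unsigned long)page_fault_entry },
    {  0, 0, 0, 0 } /* address == 0 terminates the table */
};

HYPERVISOR_set_trap_table(traps);
]
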
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index a1981289a4..351bb950d8 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -65,7 +65,6 @@
#include <asm/debugger.h>
#include <asm/msr.h>
#include <asm/shared.h>
-#include <asm/x86_emulate.h>
#include <asm/traps.h>
#include <asm/hvm/vpt.h>
#include <asm/hypercall.h>
@@ -544,20 +543,6 @@ static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
return 1;
}
-static const char *trapstr(unsigned int trapnr)
-{
- static const char * const strings[] = {
- "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
- "invalid opcode", "device not available", "double fault",
- "coprocessor segment", "invalid tss", "segment not found",
- "stack error", "general protection fault", "page fault",
- "spurious interrupt", "coprocessor error", "alignment check",
- "machine check", "simd error", "virtualisation exception"
- };
-
- return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
-}
-
/*
* This is called for faults at very unexpected times (e.g., when interrupts
* are disabled). In such situations we can't do much that is safe. We try to
@@ -625,138 +610,6 @@ void fatal_trap(const struct cpu_user_regs *regs, bool_t show_remote)
(regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
}
-void pv_inject_event(const struct x86_event *event)
-{
- struct vcpu *v = current;
- struct cpu_user_regs *regs = guest_cpu_user_regs();
- struct trap_bounce *tb;
- const struct trap_info *ti;
- const uint8_t vector = event->vector;
- const bool use_error_code =
- ((vector < 32) && (TRAP_HAVE_EC & (1u << vector)));
- unsigned int error_code = event->error_code;
-
- ASSERT(vector == event->vector); /* Confirm no truncation. */
- if ( use_error_code )
- ASSERT(error_code != X86_EVENT_NO_EC);
- else
- ASSERT(error_code == X86_EVENT_NO_EC);
-
- tb = &v->arch.pv_vcpu.trap_bounce;
- ti = &v->arch.pv_vcpu.trap_ctxt[vector];
-
- tb->flags = TBF_EXCEPTION;
- tb->cs = ti->cs;
- tb->eip = ti->address;
-
- if ( vector == TRAP_page_fault )
- {
- v->arch.pv_vcpu.ctrlreg[2] = event->cr2;
- arch_set_cr2(v, event->cr2);
-
- /* Re-set error_code.user flag appropriately for the guest. */
- error_code &= ~PFEC_user_mode;
- if ( !guest_kernel_mode(v, regs) )
- error_code |= PFEC_user_mode;
-
- trace_pv_page_fault(event->cr2, error_code);
- }
- else
- trace_pv_trap(vector, regs->rip, use_error_code, error_code);
-
- if ( use_error_code )
- {
- tb->flags |= TBF_EXCEPTION_ERRCODE;
- tb->error_code = error_code;
- }
-
- if ( TI_GET_IF(ti) )
- tb->flags |= TBF_INTERRUPT;
-
- if ( unlikely(null_trap_bounce(v, tb)) )
- {
- gprintk(XENLOG_WARNING,
- "Unhandled %s fault/trap [#%d, ec=%04x]\n",
- trapstr(vector), vector, error_code);
-
- if ( vector == TRAP_page_fault )
- show_page_walk(event->cr2);
- }
-}
-
-static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
-{
- regs->rip = rip;
- regs->eflags &= ~X86_EFLAGS_RF;
- if ( regs->eflags & X86_EFLAGS_TF )
- {
- current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
- pv_inject_guest_trap(TRAP_debug, regs);
- }
-}
-
-static unsigned int check_guest_io_breakpoint(struct vcpu *v,
- unsigned int port, unsigned int len)
-{
- unsigned int width, i, match = 0;
- unsigned long start;
-
- if ( !(v->arch.debugreg[5]) ||
- !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
- return 0;
-
- for ( i = 0; i < 4; i++ )
- {
- if ( !(v->arch.debugreg[5] &
- (3 << (i * DR_ENABLE_SIZE))) )
- continue;
-
- start = v->arch.debugreg[i];
- width = 0;
-
- switch ( (v->arch.debugreg[7] >>
- (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
- {
- case DR_LEN_1: width = 1; break;
- case DR_LEN_2: width = 2; break;
- case DR_LEN_4: width = 4; break;
- case DR_LEN_8: width = 8; break;
- }
-
- if ( (start < (port + len)) && ((start + width) > port) )
- match |= 1 << i;
- }
-
- return match;
-}
-
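
For reference, a standalone sketch of the matching logic above: each enabled breakpoint's DR7 LEN field selects an access width, and an I/O access of len bytes at port matches breakpoint i when the two byte ranges overlap. The helper names below are illustrative, not Xen's:

    #include <stdbool.h>

    /* Decode a DR7 LENn field, already shifted down and masked to 0xc. */
    static unsigned int dr7_width(unsigned int len_bits)
    {
        switch ( len_bits & 0xc )
        {
        case 0x0: return 1;            /* DR_LEN_1 */
        case 0x4: return 2;            /* DR_LEN_2 */
        case 0x8: return 8;            /* DR_LEN_8 */
        case 0xc: return 4;            /* DR_LEN_4 */
        }
        return 0;                      /* not reachable */
    }

    /* Does [port, port + len) overlap [start, start + width)? */
    static bool io_bp_match(unsigned long start, unsigned int width,
                            unsigned int port, unsigned int len)
    {
        return (start < port + len) && (start + width > port);
    }
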
-/*
- * Called from asm to set up the MCE trapbounce info.
- * Returns 0 if no callback is set up, else 1.
- */
-int set_guest_machinecheck_trapbounce(void)
-{
- struct vcpu *v = current;
- struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
-
- pv_inject_guest_trap(TRAP_machine_check, guest_cpu_user_regs());
- tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
- return !null_trap_bounce(v, tb);
-}
-
-/*
- * Called from asm to set up the NMI trapbounce info.
- * Returns 0 if no callback is set up, else 1.
- */
-int set_guest_nmi_trapbounce(void)
-{
- struct vcpu *v = current;
- struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
- pv_inject_guest_trap(TRAP_nmi, guest_cpu_user_regs());
- tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
- return !null_trap_bounce(v, tb);
-}
-
void do_reserved_trap(struct cpu_user_regs *regs)
{
unsigned int trapnr = regs->entry_vector;
@@ -997,77 +850,6 @@ void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
}
}
-static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
-{
- char opcode[3];
- unsigned long eip, rc;
- struct vcpu *v = current;
-
- eip = regs->rip;
- if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
- {
- pv_inject_page_fault(0, eip + sizeof(opcode) - rc);
- return EXCRET_fault_fixed;
- }
- if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
- return 0;
- eip += sizeof(opcode);
- pv_soft_rdtsc(v, regs, 1);
- instruction_done(regs, eip);
- return EXCRET_fault_fixed;
-}
-
-static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
-{
- char sig[5], instr[2];
- unsigned long eip, rc;
- struct cpuid_leaf res;
-
- eip = regs->rip;
-
- /* Check for forced emulation signature: ud2 ; .ascii "xen". */
- if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
- {
- pv_inject_page_fault(0, eip + sizeof(sig) - rc);
- return EXCRET_fault_fixed;
- }
- if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
- return 0;
- eip += sizeof(sig);
-
- /* We only emulate CPUID. */
- if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
- {
- pv_inject_page_fault(0, eip + sizeof(instr) - rc);
- return EXCRET_fault_fixed;
- }
- if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
- return 0;
-
- /* If cpuid faulting is enabled and CPL>0 inject a #GP in place of #UD. */
- if ( current->arch.cpuid_faulting && !guest_kernel_mode(current, regs) )
- {
- regs->rip = eip;
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return EXCRET_fault_fixed;
- }
-
- eip += sizeof(instr);
-
- guest_cpuid(current, regs->eax, regs->ecx, &res);
-
- regs->rax = res.a;
- regs->rbx = res.b;
- regs->rcx = res.c;
- regs->rdx = res.d;
-
- instruction_done(regs, eip);
-
- trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->rip);
-
- return EXCRET_fault_fixed;
-}
-
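
For reference, the guest-side counterpart of the signature matched above: the public headers define XEN_EMULATE_PREFIX as ud2 (0f 0b) followed by the ASCII bytes "xen", and only a trailing CPUID is emulated. A minimal sketch, meaningful only when executed inside a PV guest:

    #include <stdint.h>

    /* As in xen/include/public/arch-x86/xen.h: ud2 ; .ascii "xen" */
    #define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "

    static inline void xen_cpuid(uint32_t leaf, uint32_t subleaf,
                                 uint32_t res[4])
    {
        asm volatile ( XEN_EMULATE_PREFIX "cpuid"
                       : "=a" (res[0]), "=b" (res[1]),
                         "=c" (res[2]), "=d" (res[3])
                       : "0" (leaf), "2" (subleaf) );
    }
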
void do_invalid_op(struct cpu_user_regs *regs)
{
const struct bug_frame *bug = NULL;
@@ -1576,1963 +1358,173 @@ void __init do_early_page_fault(struct cpu_user_regs *regs)
}
}
-long do_fpu_taskswitch(int set)
+void do_general_protection(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
+ unsigned long fixup;
- if ( set )
- {
- v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS;
- stts();
- }
- else
- {
- v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
- if ( v->fpu_dirtied )
- clts();
- }
+ if ( debugger_trap_entry(TRAP_gp_fault, regs) )
+ return;
- return 0;
-}
+ if ( regs->error_code & X86_XEC_EXT )
+ goto hardware_gp;
-static int read_descriptor(unsigned int sel,
- const struct vcpu *v,
- unsigned long *base,
- unsigned long *limit,
- unsigned int *ar,
- bool_t insn_fetch)
-{
- struct desc_struct desc;
-
- if ( sel < 4)
- desc.b = desc.a = 0;
- else if ( __get_user(desc,
- (const struct desc_struct *)(!(sel & 4)
- ? GDT_VIRT_START(v)
- : LDT_VIRT_START(v))
- + (sel >> 3)) )
- return 0;
- if ( !insn_fetch )
- desc.b &= ~_SEGMENT_L;
+ if ( !guest_mode(regs) )
+ goto gp_in_kernel;
- *ar = desc.b & 0x00f0ff00;
- if ( !(desc.b & _SEGMENT_L) )
+ /*
+ * Cunning trick to allow arbitrary "INT n" handling.
+ *
+ * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
+ * instruction from trapping to the appropriate vector, when that might not
+ * be expected by Xen or the guest OS. For example, that entry might be for
+ * a fault handler (unlike traps, faults don't increment EIP), or might
+ * expect an error code on the stack (which a software trap never
+ * provides), or might be a hardware interrupt handler that doesn't like
+ * being called spuriously.
+ *
+ * Instead, a GPF occurs with the faulting IDT vector in the error code.
+ * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
+ * clear (which got already checked above) to indicate that it's a software
+ * fault, not a hardware one.
+ *
+ * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
+ * okay because they can only be triggered by an explicit DPL-checked
+ * instruction. The DPL specified by the guest OS for these vectors is NOT
+ * CHECKED!!
+ */
+ if ( regs->error_code & X86_XEC_IDT )
{
- *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
- (desc.b & 0xff000000));
- *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
- if ( desc.b & _SEGMENT_G )
- *limit = ((*limit + 1) << 12) - 1;
-#ifndef NDEBUG
- if ( sel > 3 )
+ /* This fault must be due to <INT n> instruction. */
+ const struct trap_info *ti;
+ unsigned char vector = regs->error_code >> 3;
+ ti = &v->arch.pv_vcpu.trap_ctxt[vector];
+ if ( permit_softint(TI_GET_DPL(ti), v, regs) )
{
- unsigned int a, l;
- unsigned char valid;
-
- asm volatile (
- "larl %2,%0 ; setz %1"
- : "=r" (a), "=qm" (valid) : "rm" (sel));
- BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
- asm volatile (
- "lsll %2,%0 ; setz %1"
- : "=r" (l), "=qm" (valid) : "rm" (sel));
- BUG_ON(valid && (l != *limit));
+ regs->rip += 2;
+ pv_inject_guest_trap(vector, regs);
+ return;
}
-#endif
}
- else
+ else if ( is_pv_32bit_vcpu(v) && regs->error_code )
{
- *base = 0UL;
- *limit = ~0UL;
+ emulate_gate_op(regs);
+ return;
}
- return 1;
-}
-
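
For reference, the base/limit unpacking performed by read_descriptor() in isolation: a legacy descriptor scatters the base across bytes 2-4 and 7 and the 20-bit limit across bytes 0-1 plus the low nibble of byte 6, with the G bit scaling the limit to 4KiB units. A standalone sketch, where lo/hi correspond to desc.a/desc.b:

    #include <stdint.h>

    static void unpack_desc(uint32_t lo, uint32_t hi,
                            unsigned long *base, unsigned long *limit)
    {
        *base = (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xff000000);
        *limit = (lo & 0xffff) | (hi & 0x000f0000);
        if ( hi & (1u << 23) )         /* granularity bit (_SEGMENT_G) */
            *limit = ((*limit + 1) << 12) - 1;
    }
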
-static int read_gate_descriptor(unsigned int gate_sel,
- const struct vcpu *v,
- unsigned int *sel,
- unsigned long *off,
- unsigned int *ar)
-{
- struct desc_struct desc;
- const struct desc_struct *pdesc;
-
-
- pdesc = (const struct desc_struct *)
- (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
- + (gate_sel >> 3);
- if ( (gate_sel < 4) ||
- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
- __get_user(desc, pdesc) )
- return 0;
+ /* Emulate some simple privileged and I/O instructions. */
+ if ( (regs->error_code == 0) &&
+ emulate_privileged_op(regs) )
+ {
+ trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
+ return;
+ }
- *sel = (desc.a >> 16) & 0x0000fffc;
- *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
- *ar = desc.b & 0x0000ffff;
+ /* Pass on GPF as is. */
+ pv_inject_guest_trap(TRAP_gp_fault, regs);
+ return;
- /*
- * check_descriptor() clears the DPL field and stores the
- * guest requested DPL in the selector's RPL field.
- */
- if ( *ar & _SEGMENT_DPL )
- return 0;
- *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+ gp_in_kernel:
- if ( !is_pv_32bit_vcpu(v) )
+ if ( likely((fixup = search_exception_table(regs)) != 0) )
{
- if ( (*ar & 0x1f00) != 0x0c00 ||
- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
- __get_user(desc, pdesc + 1) ||
- (desc.b & 0x1f00) )
- return 0;
-
- *off |= (unsigned long)desc.a << 32;
- return 1;
+ dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
+ regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
+ this_cpu(last_extable_addr) = regs->rip;
+ regs->rip = fixup;
+ return;
}
- switch ( *ar & 0x1f00 )
- {
- case 0x0400:
- *off &= 0xffff;
- break;
- case 0x0c00:
- break;
- default:
- return 0;
- }
+ hardware_gp:
+ if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
+ return;
- return 1;
+ show_execution_state(regs);
+ panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
}
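
Worth spelling out the error-code layout do_general_protection() relies on for the "INT n" trick: bit 0 (X86_XEC_EXT) marks a hardware-originated event, bit 1 (X86_XEC_IDT) marks an IDT-relative selector, and for IDT-relative codes the vector occupies bits 3 and up. A standalone sketch:

    #include <stdio.h>

    int main(void)
    {
        /* "int $0x80" in a PV guest: software, IDT-relative, vector 0x80. */
        unsigned int ec = (0x80 << 3) | 2;

        printf("ext=%u idt=%u vector=%#x\n",
               ec & 1, (ec >> 1) & 1, ec >> 3);
        return 0;
    }
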
-static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
- unsigned int bytes, unsigned long limit,
- enum x86_segment seg,
- struct x86_emulate_ctxt *ctxt,
- unsigned long *addr)
+static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
+
+static void nmi_mce_softirq(void)
{
- int rc = X86EMUL_OKAY;
+ int cpu = smp_processor_id();
+ struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
- *addr = base + offset;
+ BUG_ON(st->vcpu == NULL);
+
+ /* Set the tmp value unconditionally, so that
+ * the check in the iret hypercall works. */
+ cpumask_copy(st->vcpu->cpu_hard_affinity_tmp,
+ st->vcpu->cpu_hard_affinity);
- if ( ctxt->addr_size < 64 )
+ if ((cpu != st->processor)
+ || (st->processor != st->vcpu->processor))
{
- if ( limit < bytes - 1 || offset > limit - bytes + 1 )
- rc = X86EMUL_EXCEPTION;
- *addr = (uint32_t)*addr;
- }
- else if ( !__addr_ok(*addr) )
- rc = X86EMUL_EXCEPTION;
+ /* We are on a different physical cpu.
+ * Make sure to wakeup the vcpu on the
+ * specified processor.
+ */
+ vcpu_set_hard_affinity(st->vcpu, cpumask_of(st->processor));
- if ( unlikely(rc == X86EMUL_EXCEPTION) )
- x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
- : TRAP_stack_error,
- 0, ctxt);
+ /* Affinity is restored in the iret hypercall. */
+ }
- return rc;
+ /* Only used to defer wakeup of domain/vcpu to
+ * a safe (non-NMI/MCE) context.
+ */
+ vcpu_kick(st->vcpu);
+ st->vcpu = NULL;
}
-struct priv_op_ctxt {
- struct x86_emulate_ctxt ctxt;
- struct {
- unsigned long base, limit;
- } cs;
- char *io_emul_stub;
- unsigned int bpmatch;
- unsigned int tsc;
-#define TSC_BASE 1
-#define TSC_AUX 2
-};
-
-static int priv_op_insn_fetch(enum x86_segment seg,
- unsigned long offset,
- void *p_data,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
+static void pci_serr_softirq(void)
{
- const struct priv_op_ctxt *poc =
- container_of(ctxt, struct priv_op_ctxt, ctxt);
- unsigned int rc;
- unsigned long addr = poc->cs.base + offset;
+ printk("\n\nNMI - PCI system error (SERR)\n");
+ outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
+}
- ASSERT(seg == x86_seg_cs);
+void async_exception_cleanup(struct vcpu *curr)
+{
+ int trap;
- /* We don't mean to emulate any branches. */
- if ( !bytes )
- return X86EMUL_UNHANDLEABLE;
+ if ( !curr->async_exception_mask )
+ return;
- rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
- x86_seg_cs, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
+ /* Restore affinity. */
+ if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
+ !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
+ {
+ vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
+ cpumask_clear(curr->cpu_hard_affinity_tmp);
+ }
- if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+ if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
+ trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
+ else
+ for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
+ if ( (curr->async_exception_mask ^
+ curr->async_exception_state(trap).old_mask) == (1 << trap) )
+ break;
+ if ( unlikely(trap > VCPU_TRAP_LAST) )
{
- /*
- * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
- * cpu_has_nx, but we'd then need a "fetch" variant of
- * __copy_from_user() respecting NX, SMEP, and protection keys.
- */
- x86_emul_pagefault(0, addr + bytes - rc, ctxt);
- return X86EMUL_EXCEPTION;
+ ASSERT_UNREACHABLE();
+ return;
}
- return X86EMUL_OKAY;
+ /* Restore previous asynchronous exception mask. */
+ curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
}
-static int priv_op_read_segment(enum x86_segment seg,
- struct segment_register *reg,
- struct x86_emulate_ctxt *ctxt)
+static void nmi_hwdom_report(unsigned int reason_idx)
{
- /* Check if this is an attempt to access the I/O bitmap. */
- if ( seg == x86_seg_tr )
- {
- switch ( ctxt->opcode )
- {
- case 0x6c ... 0x6f: /* ins / outs */
- case 0xe4 ... 0xe7: /* in / out (immediate port) */
- case 0xec ... 0xef: /* in / out (port in %dx) */
- /* Defer the check to priv_op_{read,write}_io(). */
- return X86EMUL_DONE;
- }
- }
+ struct domain *d = hardware_domain;
- if ( ctxt->addr_size < 64 )
- {
- unsigned long limit;
- unsigned int sel, ar;
+ if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
+ return;
- switch ( seg )
- {
- case x86_seg_cs: sel = ctxt->regs->cs; break;
- case x86_seg_ds: sel = read_sreg(ds); break;
- case x86_seg_es: sel = read_sreg(es); break;
- case x86_seg_fs: sel = read_sreg(fs); break;
- case x86_seg_gs: sel = read_sreg(gs); break;
- case x86_seg_ss: sel = ctxt->regs->ss; break;
- default: return X86EMUL_UNHANDLEABLE;
- }
+ set_bit(reason_idx, nmi_reason(d));
- if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
- return X86EMUL_UNHANDLEABLE;
-
- reg->limit = limit;
- reg->attr.bytes = ar >> 8;
- }
- else
- {
- switch ( seg )
- {
- default:
- if ( !is_x86_user_segment(seg) )
- return X86EMUL_UNHANDLEABLE;
- reg->base = 0;
- break;
- case x86_seg_fs:
- reg->base = rdfsbase();
- break;
- case x86_seg_gs:
- reg->base = rdgsbase();
- break;
- }
-
- reg->limit = ~0U;
-
- reg->attr.bytes = 0;
- reg->attr.fields.type = _SEGMENT_WR >> 8;
- if ( seg == x86_seg_cs )
- {
- reg->attr.fields.type |= _SEGMENT_CODE >> 8;
- reg->attr.fields.l = 1;
- }
- else
- reg->attr.fields.db = 1;
- reg->attr.fields.s = 1;
- reg->attr.fields.dpl = 3;
- reg->attr.fields.p = 1;
- reg->attr.fields.g = 1;
- }
-
- /*
- * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
- * Also do this for consistency for non-conforming code segments.
- */
- if ( (seg == x86_seg_ss ||
- (seg == x86_seg_cs &&
- !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
- guest_kernel_mode(current, ctxt->regs) )
- reg->attr.fields.dpl = 0;
-
- return X86EMUL_OKAY;
-}
-
-/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
-static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
-{
- unsigned int cpl = guest_kernel_mode(v, regs) ?
- (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
-
- ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
-
- return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
-}
-
-/* Has the guest requested sufficient permission for this I/O access? */
-static int guest_io_okay(
- unsigned int port, unsigned int bytes,
- struct vcpu *v, struct cpu_user_regs *regs)
-{
- /* If in user mode, switch to kernel mode just to read I/O bitmap. */
- int user_mode = !(v->arch.flags & TF_kernel_mode);
-#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
-
- if ( iopl_ok(v, regs) )
- return 1;
-
- if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
- {
- union { uint8_t bytes[2]; uint16_t mask; } x;
-
- /*
- * Grab permission bytes from guest space. Inaccessible bytes are
- * read as 0xff (no access allowed).
- */
- TOGGLE_MODE();
- switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
- port>>3, 2) )
- {
- default: x.bytes[0] = ~0;
- /* fallthrough */
- case 1: x.bytes[1] = ~0;
- /* fallthrough */
- case 0: break;
- }
- TOGGLE_MODE();
-
- if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
- return 1;
- }
-
- return 0;
-}
-
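
For reference, the bitmap test above in standalone form: a clear bit permits the port, and two bytes are fetched so that an access crossing a byte boundary is still fully covered (guest_io_okay() additionally treats unreadable bitmap bytes as all-ones). A sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* The caller has already checked port + bytes against the bitmap limit. */
    static bool io_bitmap_allows(const uint8_t *bitmap,
                                 unsigned int port, unsigned int bytes)
    {
        uint16_t mask = bitmap[port >> 3] | (bitmap[(port >> 3) + 1] << 8);

        return !(mask & (((1u << bytes) - 1) << (port & 7)));
    }
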
-/* Has the administrator granted sufficient permission for this I/O access? */
-static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
- const struct domain *d)
-{
- /*
- * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
- * We never permit direct access to that register.
- */
- if ( (port == 0xcf8) && (bytes == 4) )
- return 0;
-
- /* We also never permit direct access to the RTC/CMOS registers. */
- if ( ((port & ~1) == RTC_PORT(0)) )
- return 0;
-
- return ioports_access_permitted(d, port, port + bytes - 1);
-}
-
-static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
- unsigned int size, uint32_t *write)
-{
- uint32_t machine_bdf;
-
- if ( !is_hardware_domain(currd) )
- return 0;
-
- if ( !CF8_ENABLED(currd->arch.pci_cf8) )
- return 1;
-
- machine_bdf = CF8_BDF(currd->arch.pci_cf8);
- if ( write )
- {
- const unsigned long *ro_map = pci_get_ro_map(0);
-
- if ( ro_map && test_bit(machine_bdf, ro_map) )
- return 0;
- }
- start |= CF8_ADDR_LO(currd->arch.pci_cf8);
- /* AMD extended configuration space access? */
- if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
- {
- uint64_t msr_val;
-
- if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
- return 0;
- if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
- start |= CF8_ADDR_HI(currd->arch.pci_cf8);
- }
-
- return !write ?
- xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
- start, start + size - 1, 0) == 0 :
- pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
-}
-
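
For reference, the CONFIG_ADDRESS layout that the CF8_* macros abstract: bit 31 enables the mechanism, bits 23:16/15:11/10:8 carry bus/device/function, and bits 7:2 select the dword-aligned register (on the AMD extended-configuration path above, bits 27:24 additionally supply register bits 11:8). A standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Bus 3, device 4, function 0, register 0x10 (BAR0). */
        uint32_t cf8 = (1u << 31) | (3u << 16) | (4u << 11) | (0u << 8) | 0x10;

        printf("enabled=%u bdf=%02x:%02x.%u reg=%#x\n",
               cf8 >> 31, (cf8 >> 16) & 0xff,
               (cf8 >> 11) & 0x1f, (cf8 >> 8) & 7, cf8 & 0xfc);
        return 0;
    }
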
-uint32_t guest_io_read(unsigned int port, unsigned int bytes,
- struct domain *currd)
-{
- uint32_t data = 0;
- unsigned int shift = 0;
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- switch ( bytes )
- {
- case 1: return inb(port);
- case 2: return inw(port);
- case 4: return inl(port);
- }
- }
-
- while ( bytes != 0 )
- {
- unsigned int size = 1;
- uint32_t sub_data = ~0;
-
- if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
- {
- sub_data = pv_pit_handler(port, 0, 0);
- }
- else if ( port == RTC_PORT(0) )
- {
- sub_data = currd->arch.cmos_idx;
- }
- else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
- {
- unsigned long flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
- sub_data = inb(RTC_PORT(1));
- spin_unlock_irqrestore(&rtc_lock, flags);
- }
- else if ( (port == 0xcf8) && (bytes == 4) )
- {
- size = 4;
- sub_data = currd->arch.pci_cf8;
- }
- else if ( (port & 0xfffc) == 0xcfc )
- {
- size = min(bytes, 4 - (port & 3));
- if ( size == 3 )
- size = 2;
- if ( pci_cfg_ok(currd, port & 3, size, NULL) )
- sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
- }
-
- if ( size == 4 )
- return sub_data;
-
- data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
- shift += size * 8;
- port += size;
- bytes -= size;
- }
-
- return data;
-}
-
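
For reference, the accumulation step above in isolation: each sub-read is masked to its width and shifted into position, so a single guest access spanning differently-handled ports still yields one combined value (a 4-byte sub-read is returned directly, so size is 1 or 2 here). A sketch:

    #include <stdint.h>

    static uint32_t accumulate(uint32_t data, uint32_t sub_data,
                               unsigned int size /* 1 or 2 */,
                               unsigned int *shift)
    {
        data |= (sub_data & ((1u << (size * 8)) - 1)) << *shift;
        *shift += size * 8;
        return data;
    }
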
-void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
- struct domain *currd)
-{
- if ( admin_io_okay(port, bytes, currd) )
- {
- switch ( bytes ) {
- case 1:
- outb((uint8_t)data, port);
- if ( pv_post_outb_hook )
- pv_post_outb_hook(port, (uint8_t)data);
- break;
- case 2:
- outw((uint16_t)data, port);
- break;
- case 4:
- outl(data, port);
- break;
- }
- return;
- }
-
- while ( bytes != 0 )
- {
- unsigned int size = 1;
-
- if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
- {
- pv_pit_handler(port, (uint8_t)data, 1);
- }
- else if ( port == RTC_PORT(0) )
- {
- currd->arch.cmos_idx = data;
- }
- else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
- {
- unsigned long flags;
-
- if ( pv_rtc_handler )
- pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
- spin_lock_irqsave(&rtc_lock, flags);
- outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
- outb(data, RTC_PORT(1));
- spin_unlock_irqrestore(&rtc_lock, flags);
- }
- else if ( (port == 0xcf8) && (bytes == 4) )
- {
- size = 4;
- currd->arch.pci_cf8 = data;
- }
- else if ( (port & 0xfffc) == 0xcfc )
- {
- size = min(bytes, 4 - (port & 3));
- if ( size == 3 )
- size = 2;
- if ( pci_cfg_ok(currd, port & 3, size, &data) )
- pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
- }
-
- if ( size == 4 )
- return;
-
- port += size;
- bytes -= size;
- data >>= size * 8;
- }
-}
-
-/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
-void host_to_guest_gpr_switch(struct cpu_user_regs *);
-unsigned long guest_to_host_gpr_switch(unsigned long);
-
-void (*pv_post_outb_hook)(unsigned int port, u8 value);
-
-typedef void io_emul_stub_t(struct cpu_user_regs *);
-
-static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
- unsigned int port, unsigned int bytes)
-{
- if ( !ctxt->io_emul_stub )
- ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
- (this_cpu(stubs.addr) &
- ~PAGE_MASK) +
- STUB_BUF_SIZE / 2;
-
- /* movq $host_to_guest_gpr_switch,%rcx */
- ctxt->io_emul_stub[0] = 0x48;
- ctxt->io_emul_stub[1] = 0xb9;
- *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
- /* callq *%rcx */
- ctxt->io_emul_stub[10] = 0xff;
- ctxt->io_emul_stub[11] = 0xd1;
- /* data16 or nop */
- ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
- /* <io-access opcode> */
- ctxt->io_emul_stub[13] = opcode;
- /* imm8 or nop */
- ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
- /* ret (jumps to guest_to_host_gpr_switch) */
- ctxt->io_emul_stub[15] = 0xc3;
- BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
- if ( ioemul_handle_quirk )
- ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
-
- /* Handy function-typed pointer to the stub. */
- return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-}
-
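
The 16 bytes assembled above, annotated for reference (offsets within the per-CPU stub buffer; the data16 prefix and the immediate are replaced by NOPs when unused):

     0: 48 b9 <imm64>   movabs $host_to_guest_gpr_switch, %rcx
    10: ff d1           callq *%rcx
    12: 66 or 90        data16 prefix (2-byte access) or nop
    13: <opcode>        the in/out instruction being emulated
    14: <imm8> or 90    immediate port (opcodes e4-e7) or nop (ec-ef)
    15: c3              ret, returning via guest_to_host_gpr_switch
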
-static int priv_op_read_io(unsigned int port, unsigned int bytes,
- unsigned long *val, struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
-
- /* INS must not come here. */
- ASSERT((ctxt->opcode & ~9) == 0xe4);
-
- if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- io_emul_stub_t *io_emul =
- io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
- mark_regs_dirty(ctxt->regs);
- io_emul(ctxt->regs);
- return X86EMUL_DONE;
- }
-
- *val = guest_io_read(port, bytes, currd);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_write_io(unsigned int port, unsigned int bytes,
- unsigned long val, struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
-
- /* OUTS must not come here. */
- ASSERT((ctxt->opcode & ~9) == 0xe6);
-
- if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- io_emul_stub_t *io_emul =
- io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
- mark_regs_dirty(ctxt->regs);
- io_emul(ctxt->regs);
- if ( (bytes == 1) && pv_post_outb_hook )
- pv_post_outb_hook(port, val);
- return X86EMUL_DONE;
- }
-
- guest_io_write(port, bytes, val, currd);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_ins(uint16_t port,
- enum x86_segment seg, unsigned long offset,
- unsigned int bytes_per_rep, unsigned long *reps,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
- unsigned long goal = *reps;
- struct segment_register sreg;
- int rc;
-
- ASSERT(seg == x86_seg_es);
-
- *reps = 0;
-
- if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( !sreg.attr.fields.p )
- return X86EMUL_UNHANDLEABLE;
- if ( !sreg.attr.fields.s ||
- (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
- !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
- {
- x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
- while ( *reps < goal )
- {
- unsigned int data = guest_io_read(port, bytes_per_rep, currd);
- unsigned long addr;
-
- rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
- sreg.limit, x86_seg_es, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
- {
- x86_emul_pagefault(PFEC_write_access,
- addr + bytes_per_rep - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- ++*reps;
-
- if ( poc->bpmatch || hypercall_preempt_check() )
- break;
-
- /* x86_emulate() clips the repetition count to ensure we don't wrap. */
- if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
- offset -= bytes_per_rep;
- else
- offset += bytes_per_rep;
- }
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
- uint16_t port,
- unsigned int bytes_per_rep, unsigned long *reps,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
- unsigned long goal = *reps;
- struct segment_register sreg;
- int rc;
-
- *reps = 0;
-
- if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- rc = priv_op_read_segment(seg, &sreg, ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( !sreg.attr.fields.p )
- return X86EMUL_UNHANDLEABLE;
- if ( !sreg.attr.fields.s ||
- ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
- !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
- {
- x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
- : TRAP_stack_error,
- 0, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
- while ( *reps < goal )
- {
- unsigned int data = 0;
- unsigned long addr;
-
- rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
- sreg.limit, seg, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
- {
- x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- guest_io_write(port, bytes_per_rep, data, currd);
-
- ++*reps;
-
- if ( poc->bpmatch || hypercall_preempt_check() )
- break;
-
- /* x86_emulate() clips the repetition count to ensure we don't wrap. */
- if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
- offset -= bytes_per_rep;
- else
- offset += bytes_per_rep;
- }
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_read_cr(unsigned int reg, unsigned long *val,
- struct x86_emulate_ctxt *ctxt)
-{
- const struct vcpu *curr = current;
-
- switch ( reg )
- {
- case 0: /* Read CR0 */
- *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
- return X86EMUL_OKAY;
-
- case 2: /* Read CR2 */
- case 4: /* Read CR4 */
- *val = curr->arch.pv_vcpu.ctrlreg[reg];
- return X86EMUL_OKAY;
-
- case 3: /* Read CR3 */
- {
- const struct domain *currd = curr->domain;
- unsigned long mfn;
-
- if ( !is_pv_32bit_domain(currd) )
- {
- mfn = pagetable_get_pfn(curr->arch.guest_table);
- *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
- }
- else
- {
- l4_pgentry_t *pl4e =
- map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
-
- mfn = l4e_get_pfn(*pl4e);
- unmap_domain_page(pl4e);
- *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
- }
- /* PTs should not be shared */
- BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
- return X86EMUL_OKAY;
- }
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_write_cr(unsigned int reg, unsigned long val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *curr = current;
-
- switch ( reg )
- {
- case 0: /* Write CR0 */
- if ( (val ^ read_cr0()) & ~X86_CR0_TS )
- {
- gdprintk(XENLOG_WARNING,
- "Attempt to change unmodifiable CR0 flags\n");
- break;
- }
- do_fpu_taskswitch(!!(val & X86_CR0_TS));
- return X86EMUL_OKAY;
-
- case 2: /* Write CR2 */
- curr->arch.pv_vcpu.ctrlreg[2] = val;
- arch_set_cr2(curr, val);
- return X86EMUL_OKAY;
-
- case 3: /* Write CR3 */
- {
- struct domain *currd = curr->domain;
- unsigned long gfn;
- struct page_info *page;
- int rc;
-
- gfn = !is_pv_32bit_domain(currd)
- ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
- page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
- if ( !page )
- break;
- rc = new_guest_cr3(page_to_mfn(page));
- put_page(page);
-
- switch ( rc )
- {
- case 0:
- return X86EMUL_OKAY;
- case -ERESTART: /* retry after preemption */
- return X86EMUL_RETRY;
- }
- break;
- }
-
- case 4: /* Write CR4 */
- curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
- write_cr4(pv_guest_cr4_to_real_cr4(curr));
- ctxt_switch_levelling(curr);
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_read_dr(unsigned int reg, unsigned long *val,
- struct x86_emulate_ctxt *ctxt)
-{
- unsigned long res = do_get_debugreg(reg);
-
- if ( IS_ERR_VALUE(res) )
- return X86EMUL_UNHANDLEABLE;
-
- *val = res;
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_write_dr(unsigned int reg, unsigned long val,
- struct x86_emulate_ctxt *ctxt)
-{
- return do_set_debugreg(reg, val) == 0
- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
-}
-
-static inline uint64_t guest_misc_enable(uint64_t val)
-{
- val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
- MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
- val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
- MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
- MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
- return val;
-}
-
-static inline bool is_cpufreq_controller(const struct domain *d)
-{
- return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
- is_hardware_domain(d));
-}
-
-static int priv_op_read_msr(unsigned int reg, uint64_t *val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- const struct vcpu *curr = current;
- const struct domain *currd = curr->domain;
- bool vpmu_msr = false;
-
- switch ( reg )
- {
- int rc;
-
- case MSR_FS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
- return X86EMUL_OKAY;
-
- case MSR_GS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = cpu_has_fsgsbase ? __rdgsbase()
- : curr->arch.pv_vcpu.gs_base_kernel;
- return X86EMUL_OKAY;
-
- case MSR_SHADOW_GS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = curr->arch.pv_vcpu.gs_base_user;
- return X86EMUL_OKAY;
-
- /*
- * In order to fully retain original behavior, defer calling
- * pv_soft_rdtsc() until after emulation. This may want/need to be
- * reconsidered.
- */
- case MSR_IA32_TSC:
- poc->tsc |= TSC_BASE;
- goto normal;
-
- case MSR_TSC_AUX:
- poc->tsc |= TSC_AUX;
- if ( cpu_has_rdtscp )
- goto normal;
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_EFER:
- *val = read_efer();
- if ( is_pv_32bit_domain(currd) )
- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
- return X86EMUL_OKAY;
-
- case MSR_K7_FID_VID_CTL:
- case MSR_K7_FID_VID_STATUS:
- case MSR_K8_PSTATE_LIMIT:
- case MSR_K8_PSTATE_CTRL:
- case MSR_K8_PSTATE_STATUS:
- case MSR_K8_PSTATE0:
- case MSR_K8_PSTATE1:
- case MSR_K8_PSTATE2:
- case MSR_K8_PSTATE3:
- case MSR_K8_PSTATE4:
- case MSR_K8_PSTATE5:
- case MSR_K8_PSTATE6:
- case MSR_K8_PSTATE7:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
- break;
- if ( unlikely(is_cpufreq_controller(currd)) )
- goto normal;
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_IA32_UCODE_REV:
- BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
- break;
- /* As documented in the SDM: Do a CPUID 1 here */
- cpuid_eax(1);
- }
- goto normal;
-
- case MSR_IA32_MISC_ENABLE:
- if ( rdmsr_safe(reg, *val) )
- break;
- *val = guest_misc_enable(*val);
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR0_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
- break;
- *val = curr->arch.pv_vcpu.dr_mask[0];
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
- break;
- *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
- return X86EMUL_OKAY;
-
- case MSR_IA32_PERF_CAPABILITIES:
- /* No extra capabilities are supported. */
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_PLATFORM_INFO:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
- break;
- *val = 0;
- if ( this_cpu(cpuid_faulting_enabled) )
- *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_MISC_FEATURES_ENABLES:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
- break;
- *val = 0;
- if ( curr->arch.cpuid_faulting )
- *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
- return X86EMUL_OKAY;
-
- case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
- case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
- case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
- case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- vpmu_msr = true;
- /* fall through */
- case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
- case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
- if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
- {
- if ( vpmu_do_rdmsr(reg, val) )
- break;
- return X86EMUL_OKAY;
- }
- }
- /* fall through */
- default:
- if ( rdmsr_hypervisor_regs(reg, val) )
- return X86EMUL_OKAY;
-
- rc = vmce_rdmsr(reg, val);
- if ( rc < 0 )
- break;
- if ( rc )
- return X86EMUL_OKAY;
- /* fall through */
- normal:
- /* Everyone can read the MSR space. */
- /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
- if ( rdmsr_safe(reg, *val) )
- break;
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-#include "x86_64/mmconfig.h"
-
-static int priv_op_write_msr(unsigned int reg, uint64_t val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *curr = current;
- const struct domain *currd = curr->domain;
- bool vpmu_msr = false;
-
- switch ( reg )
- {
- uint64_t temp;
- int rc;
-
- case MSR_FS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrfsbase(val);
- curr->arch.pv_vcpu.fs_base = val;
- return X86EMUL_OKAY;
-
- case MSR_GS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrgsbase(val);
- curr->arch.pv_vcpu.gs_base_kernel = val;
- return X86EMUL_OKAY;
-
- case MSR_SHADOW_GS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrmsrl(MSR_SHADOW_GS_BASE, val);
- curr->arch.pv_vcpu.gs_base_user = val;
- return X86EMUL_OKAY;
-
- case MSR_K7_FID_VID_STATUS:
- case MSR_K7_FID_VID_CTL:
- case MSR_K8_PSTATE_LIMIT:
- case MSR_K8_PSTATE_CTRL:
- case MSR_K8_PSTATE_STATUS:
- case MSR_K8_PSTATE0:
- case MSR_K8_PSTATE1:
- case MSR_K8_PSTATE2:
- case MSR_K8_PSTATE3:
- case MSR_K8_PSTATE4:
- case MSR_K8_PSTATE5:
- case MSR_K8_PSTATE6:
- case MSR_K8_PSTATE7:
- case MSR_K8_HWCR:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_AMD64_NB_CFG:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
- boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
- ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
- goto invalid;
- if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_FAM10H_MMIO_CONF_BASE:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
- boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
- break;
- if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
- temp != val :
- ((temp ^ val) &
- ~(FAM10H_MMIO_CONF_ENABLE |
- (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
- FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
- ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
- FAM10H_MMIO_CONF_BASE_SHIFT))) )
- goto invalid;
- if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_UCODE_REV:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( rdmsr_safe(reg, temp) )
- break;
- if ( val )
- goto invalid;
- return X86EMUL_OKAY;
-
- case MSR_IA32_MISC_ENABLE:
- if ( rdmsr_safe(reg, temp) )
- break;
- if ( val != guest_misc_enable(temp) )
- goto invalid;
- return X86EMUL_OKAY;
-
- case MSR_IA32_MPERF:
- case MSR_IA32_APERF:
- if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_PERF_CTL:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_THERM_CONTROL:
- case MSR_IA32_ENERGY_PERF_BIAS:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_AMD64_DR0_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
- break;
- curr->arch.pv_vcpu.dr_mask[0] = val;
- if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
- wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
- break;
- curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
- if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
- wrmsrl(reg, val);
- return X86EMUL_OKAY;
-
- case MSR_INTEL_PLATFORM_INFO:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
- break;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_MISC_FEATURES_ENABLES:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
- rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
- break;
- if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
- !this_cpu(cpuid_faulting_enabled) )
- break;
- curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
- return X86EMUL_OKAY;
-
- case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
- case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
- case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
- case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- vpmu_msr = true;
- case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
- case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
- if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
- {
- if ( (vpmu_mode & XENPMU_MODE_ALL) &&
- !is_hardware_domain(currd) )
- return X86EMUL_OKAY;
-
- if ( vpmu_do_wrmsr(reg, val, 0) )
- break;
- return X86EMUL_OKAY;
- }
- }
- /* fall through */
- default:
- if ( wrmsr_hypervisor_regs(reg, val) == 1 )
- return X86EMUL_OKAY;
-
- rc = vmce_wrmsr(reg, val);
- if ( rc < 0 )
- break;
- if ( rc )
- return X86EMUL_OKAY;
-
- if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
- invalid:
- gdprintk(XENLOG_WARNING,
- "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
- reg, temp, val);
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
-{
- /* Ignore the instruction if unprivileged. */
- if ( !cache_flush_permitted(current->domain) )
- /*
- * Non-physdev domain attempted WBINVD; ignore for now since
- * newer linux uses this in some start-of-day timing loops.
- */
- ;
- else
- wbinvd();
-
- return X86EMUL_OKAY;
-}
-
-int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
- struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
-{
- guest_cpuid(current, leaf, subleaf, res);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_validate(const struct x86_emulate_state *state,
- struct x86_emulate_ctxt *ctxt)
-{
- switch ( ctxt->opcode )
- {
- case 0x6c ... 0x6f: /* ins / outs */
- case 0xe4 ... 0xe7: /* in / out (immediate port) */
- case 0xec ... 0xef: /* in / out (port in %dx) */
- case X86EMUL_OPC(0x0f, 0x06): /* clts */
- case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
- case X86EMUL_OPC(0x0f, 0x20) ...
- X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
- case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
- case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
- case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
- case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
- return X86EMUL_OKAY;
-
- case 0xfa: case 0xfb: /* cli / sti */
- if ( !iopl_ok(current, ctxt->regs) )
- break;
- /*
- * This is just too dangerous to allow, in my opinion. Consider if the
- * caller then tries to reenable interrupts using POPF: we can't trap
- * that and we'll end up with hard-to-debug lockups. Fast & loose will
- * do for us. :-)
- vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
- */
- return X86EMUL_DONE;
-
- case X86EMUL_OPC(0x0f, 0x01):
- {
- unsigned int modrm_rm, modrm_reg;
-
- if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
- (modrm_rm & 7) != 1 )
- break;
- switch ( modrm_reg & 7 )
- {
- case 2: /* xsetbv */
- case 7: /* rdtscp */
- return X86EMUL_OKAY;
- }
- break;
- }
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static const struct x86_emulate_ops priv_op_ops = {
- .insn_fetch = priv_op_insn_fetch,
- .read = x86emul_unhandleable_rw,
- .validate = priv_op_validate,
- .read_io = priv_op_read_io,
- .write_io = priv_op_write_io,
- .rep_ins = priv_op_rep_ins,
- .rep_outs = priv_op_rep_outs,
- .read_segment = priv_op_read_segment,
- .read_cr = priv_op_read_cr,
- .write_cr = priv_op_write_cr,
- .read_dr = priv_op_read_dr,
- .write_dr = priv_op_write_dr,
- .read_msr = priv_op_read_msr,
- .write_msr = priv_op_write_msr,
- .cpuid = pv_emul_cpuid,
- .wbinvd = priv_op_wbinvd,
-};
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
-{
- struct vcpu *curr = current;
- struct domain *currd = curr->domain;
- struct priv_op_ctxt ctxt = {
- .ctxt.regs = regs,
- .ctxt.vendor = currd->arch.cpuid->x86_vendor,
- };
- int rc;
- unsigned int eflags, ar;
-
- if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
- &ar, 1) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- !(ar & _SEGMENT_CODE) )
- return 0;
-
- /* Mirror virtualized state into EFLAGS. */
- ASSERT(regs->eflags & X86_EFLAGS_IF);
- if ( vcpu_info(curr, evtchn_upcall_mask) )
- regs->eflags &= ~X86_EFLAGS_IF;
- else
- regs->eflags |= X86_EFLAGS_IF;
- ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
- regs->eflags |= curr->arch.pv_vcpu.iopl;
- eflags = regs->eflags;
-
- ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
- /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
- rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
-
- if ( ctxt.io_emul_stub )
- unmap_domain_page(ctxt.io_emul_stub);
-
- /*
- * Un-mirror virtualized state from EFLAGS.
- * Nothing we allow to be emulated can change anything other than the
- * arithmetic bits, and the resume flag.
- */
- ASSERT(!((regs->eflags ^ eflags) &
- ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
- regs->eflags |= X86_EFLAGS_IF;
- regs->eflags &= ~X86_EFLAGS_IOPL;
-
- switch ( rc )
- {
- case X86EMUL_OKAY:
- if ( ctxt.tsc & TSC_BASE )
- {
- if ( ctxt.tsc & TSC_AUX )
- pv_soft_rdtsc(curr, regs, 1);
- else if ( currd->arch.vtsc )
- pv_soft_rdtsc(curr, regs, 0);
- else
- msr_split(regs, rdtsc());
- }
-
- if ( ctxt.ctxt.retire.singlestep )
- ctxt.bpmatch |= DR_STEP;
- if ( ctxt.bpmatch )
- {
- curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
- if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
- pv_inject_guest_trap(TRAP_debug, regs);
- }
- /* fall through */
- case X86EMUL_RETRY:
- return EXCRET_fault_fixed;
-
- case X86EMUL_EXCEPTION:
- pv_inject_event(&ctxt.ctxt.event);
- return EXCRET_fault_fixed;
- }
-
- return 0;
-}
-
-static inline int check_stack_limit(unsigned int ar, unsigned int limit,
- unsigned int esp, unsigned int decr)
-{
- return (((esp - decr) < (esp - 1)) &&
- (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
-}
-
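
For reference, check_stack_limit() in standalone form: on a normal (expand-up) segment the highest byte touched must lie within the limit, while on an expand-down segment (_SEGMENT_EC) valid offsets lie strictly above it; the first term rules out degenerate decrements and wrap-around. A sketch with the attribute bit written out:

    #include <stdbool.h>

    #define SEG_EXPAND_DOWN (1u << 10)  /* stands in for _SEGMENT_EC */

    static bool stack_limit_ok(unsigned int ar, unsigned int limit,
                               unsigned int esp, unsigned int decr)
    {
        return (esp - decr) < (esp - 1) &&
               (!(ar & SEG_EXPAND_DOWN) ? (esp - 1) <= limit
                                        : (esp - decr) > limit);
    }
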
-struct gate_op_ctxt {
- struct x86_emulate_ctxt ctxt;
- struct {
- unsigned long base, limit;
- } cs;
- bool insn_fetch;
-};
-
-static int gate_op_read(
- enum x86_segment seg,
- unsigned long offset,
- void *p_data,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- const struct gate_op_ctxt *goc =
- container_of(ctxt, struct gate_op_ctxt, ctxt);
- unsigned int rc = bytes, sel = 0;
- unsigned long addr = offset, limit = 0;
-
- switch ( seg )
- {
- case x86_seg_cs:
- addr += goc->cs.base;
- limit = goc->cs.limit;
- break;
- case x86_seg_ds:
- sel = read_sreg(ds);
- break;
- case x86_seg_es:
- sel = read_sreg(es);
- break;
- case x86_seg_fs:
- sel = read_sreg(fs);
- break;
- case x86_seg_gs:
- sel = read_sreg(gs);
- break;
- case x86_seg_ss:
- sel = ctxt->regs->ss;
- break;
- default:
- return X86EMUL_UNHANDLEABLE;
- }
- if ( sel )
- {
- unsigned int ar;
-
- ASSERT(!goc->insn_fetch);
- if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
- return X86EMUL_UNHANDLEABLE;
- addr += offset;
- }
- else if ( seg != x86_seg_cs )
- return X86EMUL_UNHANDLEABLE;
-
- /* We don't mean to emulate any branches. */
- if ( limit < bytes - 1 || offset > limit - bytes + 1 )
- return X86EMUL_UNHANDLEABLE;
-
- addr = (uint32_t)addr;
-
- if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
- {
- /*
- * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
- * cpu_has_nx, but we'd then need a "fetch" variant of
- * __copy_from_user() respecting NX, SMEP, and protection keys.
- */
- x86_emul_pagefault(0, addr + bytes - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- return X86EMUL_OKAY;
-}
-
-static void emulate_gate_op(struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- unsigned int sel, ar, dpl, nparm, insn_len;
- struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
- struct x86_emulate_state *state;
- unsigned long off, base, limit;
- uint16_t opnd_sel = 0;
- int jump = -1, rc = X86EMUL_OKAY;
-
- /* Check whether this fault is due to the use of a call gate. */
- if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
- (((ar >> 13) & 3) < (regs->cs & 3)) ||
- ((ar & _SEGMENT_TYPE) != 0xc00) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- if ( !(ar & _SEGMENT_P) )
- {
- pv_inject_guest_trap(TRAP_no_segment, regs);
- return;
- }
- dpl = (ar >> 13) & 3;
- nparm = ar & 0x1f;
-
- /*
- * Decode instruction (and perhaps operand) to determine RPL,
- * whether this is a jump or a call, and the call return offset.
- */
- if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
- &ar, 0) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- !(ar & _SEGMENT_CODE) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
-
- ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
- /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
- state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
- ctxt.insn_fetch = false;
- if ( IS_ERR_OR_NULL(state) )
- {
- if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
- {
- ASSERT(ctxt.ctxt.event_pending);
- pv_inject_event(&ctxt.ctxt.event);
- }
- else
- {
- ASSERT(!ctxt.ctxt.event_pending);
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- }
- return;
- }
-
- switch ( ctxt.ctxt.opcode )
- {
- unsigned int modrm_345;
-
- case 0xea:
- ++jump;
- /* fall through */
- case 0x9a:
- ++jump;
- opnd_sel = x86_insn_immediate(state, 1);
- break;
- case 0xff:
- if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
- break;
- switch ( modrm_345 & 7 )
- {
- enum x86_segment seg;
-
- case 5:
- ++jump;
- /* fall through */
- case 3:
- ++jump;
- base = x86_insn_operand_ea(state, &seg);
- rc = gate_op_read(seg,
- base + (x86_insn_opsize(state) >> 3),
- &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
- break;
- }
- break;
- }
-
- insn_len = x86_insn_length(state, &ctxt.ctxt);
- x86_emulate_free_state(state);
-
- if ( rc == X86EMUL_EXCEPTION )
- {
- ASSERT(ctxt.ctxt.event_pending);
- pv_inject_event(&ctxt.ctxt.event);
- return;
- }
-
- ASSERT(!ctxt.ctxt.event_pending);
-
- if ( rc != X86EMUL_OKAY ||
- jump < 0 ||
- (opnd_sel & ~3) != regs->error_code ||
- dpl < (opnd_sel & 3) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
-
- if ( !read_descriptor(sel, v, &base, &limit, &ar, 0) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_CODE) ||
- (!jump || (ar & _SEGMENT_EC) ?
- ((ar >> 13) & 3) > (regs->cs & 3) :
- ((ar >> 13) & 3) != (regs->cs & 3)) )
- {
- pv_inject_hw_exception(TRAP_gp_fault, sel);
- return;
- }
- if ( !(ar & _SEGMENT_P) )
- {
- pv_inject_hw_exception(TRAP_no_segment, sel);
- return;
- }
- if ( off > limit )
- {
- pv_inject_hw_exception(TRAP_gp_fault, 0);
- return;
- }
-
- if ( !jump )
- {
- unsigned int ss, esp, *stkp;
- int rc;
-#define push(item) do \
- { \
- --stkp; \
- esp -= 4; \
- rc = __put_user(item, stkp); \
- if ( rc ) \
- { \
- pv_inject_page_fault(PFEC_write_access, \
- (unsigned long)(stkp + 1) - rc); \
- return; \
- } \
- } while ( 0 )
-
- if ( ((ar >> 13) & 3) < (regs->cs & 3) )
- {
- sel |= (ar >> 13) & 3;
- /* Inner stack known only for kernel ring. */
- if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- esp = v->arch.pv_vcpu.kernel_sp;
- ss = v->arch.pv_vcpu.kernel_ss;
- if ( (ss & 3) != (sel & 3) ||
- !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
- ((ar >> 13) & 3) != (sel & 3) ||
- !(ar & _SEGMENT_S) ||
- (ar & _SEGMENT_CODE) ||
- !(ar & _SEGMENT_WR) )
- {
- pv_inject_hw_exception(TRAP_invalid_tss, ss & ~3);
- return;
- }
- if ( !(ar & _SEGMENT_P) ||
- !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
- {
- pv_inject_hw_exception(TRAP_stack_error, ss & ~3);
- return;
- }
- stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
- if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- push(regs->ss);
- push(regs->rsp);
- if ( nparm )
- {
- const unsigned int *ustkp;
-
- if ( !read_descriptor(regs->ss, v, &base, &limit, &ar, 0) ||
- ((ar >> 13) & 3) != (regs->cs & 3) ||
- !(ar & _SEGMENT_S) ||
- (ar & _SEGMENT_CODE) ||
- !(ar & _SEGMENT_WR) ||
- !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
- return pv_inject_guest_trap(TRAP_gp_fault, regs);
- ustkp = (unsigned int *)(unsigned long)
- ((unsigned int)base + regs->esp + nparm * 4);
- if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- do
- {
- unsigned int parm;
-
- --ustkp;
- rc = __get_user(parm, ustkp);
- if ( rc )
- {
- pv_inject_page_fault(0,
- (unsigned long)(ustkp + 1) - rc);
- return;
- }
- push(parm);
- } while ( --nparm );
- }
- }
- else
- {
- sel |= (regs->cs & 3);
- esp = regs->rsp;
- ss = regs->ss;
- if ( !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
- ((ar >> 13) & 3) != (sel & 3) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
- {
- pv_inject_hw_exception(TRAP_stack_error, 0);
- return;
- }
- stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
- if ( !compat_access_ok(stkp - 2, 2 * 4) )
- {
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
- }
- }
- push(regs->cs);
- push(regs->rip + insn_len);
-#undef push
- regs->rsp = esp;
- regs->ss = ss;
- }
- else
- sel |= (regs->cs & 3);
-
- regs->cs = sel;
- instruction_done(regs, off);
-}
-
-void do_general_protection(struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- unsigned long fixup;
-
- if ( debugger_trap_entry(TRAP_gp_fault, regs) )
- return;
-
- if ( regs->error_code & X86_XEC_EXT )
- goto hardware_gp;
-
- if ( !guest_mode(regs) )
- goto gp_in_kernel;
-
- /*
- * Cunning trick to allow arbitrary "INT n" handling.
- *
- * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
- * instruction from trapping to the appropriate vector, when that might not
- * be expected by Xen or the guest OS. For example, that entry might be for
- * a fault handler (unlike traps, faults don't increment EIP), or might
- * expect an error code on the stack (which a software trap never
- * provides), or might be a hardware interrupt handler that doesn't like
- * being called spuriously.
- *
- * Instead, a GPF occurs with the faulting IDT vector in the error code.
- * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
- * clear (which got already checked above) to indicate that it's a software
- * fault, not a hardware one.
- *
- * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
- * okay because they can only be triggered by an explicit DPL-checked
- * instruction. The DPL specified by the guest OS for these vectors is NOT
- * CHECKED!!
- */
- if ( regs->error_code & X86_XEC_IDT )
- {
- /* This fault must be due to <INT n> instruction. */
- const struct trap_info *ti;
- unsigned char vector = regs->error_code >> 3;
- ti = &v->arch.pv_vcpu.trap_ctxt[vector];
- if ( permit_softint(TI_GET_DPL(ti), v, regs) )
- {
- regs->rip += 2;
- pv_inject_guest_trap(vector, regs);
- return;
- }
- }
- else if ( is_pv_32bit_vcpu(v) && regs->error_code )
- {
- emulate_gate_op(regs);
- return;
- }
-
- /* Emulate some simple privileged and I/O instructions. */
- if ( (regs->error_code == 0) &&
- emulate_privileged_op(regs) )
- {
- trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
- return;
- }
-
- /* Pass on GPF as is. */
- pv_inject_guest_trap(TRAP_gp_fault, regs);
- return;
-
- gp_in_kernel:
-
- if ( likely((fixup = search_exception_table(regs)) != 0) )
- {
- dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
- regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
- this_cpu(last_extable_addr) = regs->rip;
- regs->rip = fixup;
- return;
- }
-
- hardware_gp:
- if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
- return;
-
- show_execution_state(regs);
- panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
-}
-
-static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
-
-static void nmi_mce_softirq(void)
-{
- int cpu = smp_processor_id();
- struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
-
- BUG_ON(st->vcpu == NULL);
-
- /* Set the tmp value unconditionally, so that
- * the check in the iret hypercall works. */
- cpumask_copy(st->vcpu->cpu_hard_affinity_tmp,
- st->vcpu->cpu_hard_affinity);
-
- if ((cpu != st->processor)
- || (st->processor != st->vcpu->processor))
- {
- /* We are on a different physical cpu.
- * Make sure to wakeup the vcpu on the
- * specified processor.
- */
- vcpu_set_hard_affinity(st->vcpu, cpumask_of(st->processor));
-
- /* Affinity is restored in the iret hypercall. */
- }
-
- /* Only used to defer wakeup of domain/vcpu to
- * a safe (non-NMI/MCE) context.
- */
- vcpu_kick(st->vcpu);
- st->vcpu = NULL;
-}
-
-static void pci_serr_softirq(void)
-{
- printk("\n\nNMI - PCI system error (SERR)\n");
- outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
-}
-
-void async_exception_cleanup(struct vcpu *curr)
-{
- int trap;
-
- if ( !curr->async_exception_mask )
- return;
-
- /* Restore affinity. */
- if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
- !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
- {
- vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
- cpumask_clear(curr->cpu_hard_affinity_tmp);
- }
-
- if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
- trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
- else
- for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
- if ( (curr->async_exception_mask ^
- curr->async_exception_state(trap).old_mask) == (1 << trap) )
- break;
- if ( unlikely(trap > VCPU_TRAP_LAST) )
- {
- ASSERT_UNREACHABLE();
- return;
- }
-
- /* Restore previous asynchronous exception mask. */
- curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
-}
-
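The scan in async_exception_cleanup() leans on a standard idiom: "mask & (mask - 1)" clears the lowest set bit, so the test is zero exactly when at most one bit is set, and __scanbit() then returns that bit's index (or VCPU_TRAP_NONE for an empty mask). A self-contained illustration, using __builtin_ctz() as a stand-in for __scanbit():

    #include <stdio.h>

    int main(void)
    {
        unsigned int mask = 1u << 2;        /* a single pending event */

        if ( !(mask & (mask - 1)) )         /* zero or one bit set */
            printf("pending trap: %d\n", mask ? __builtin_ctz(mask) : 0);

        return 0;
    }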
-static void nmi_hwdom_report(unsigned int reason_idx)
-{
- struct domain *d = hardware_domain;
-
- if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
- return;
-
- set_bit(reason_idx, nmi_reason(d));
-
- send_guest_trap(d, 0, TRAP_nmi);
-}
static void pci_serr_error(const struct cpu_user_regs *regs)
{
@@ -3909,34 +1901,6 @@ long register_guest_nmi_callback(unsigned long address)
return 0;
}
-long unregister_guest_nmi_callback(void)
-{
- struct vcpu *v = current;
- struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];
-
- memset(t, 0, sizeof(*t));
-
- return 0;
-}
-
-int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
-{
- struct vcpu *v;
- struct trap_info *t;
-
- BUG_ON(d == NULL);
- BUG_ON(vcpuid >= d->max_vcpus);
-
- /* Sanity check - XXX should be more fine grained. */
- BUG_ON(trap_nr >= NR_VECTORS);
-
- v = d->vcpu[vcpuid];
- t = &v->arch.pv_vcpu.trap_ctxt[trap_nr];
-
- return (t->address != 0);
-}
-
-
int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
{
struct vcpu *v;
@@ -3984,56 +1948,6 @@ int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
return -EIO;
}
-
-long do_set_trap_table(XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps)
-{
- struct trap_info cur;
- struct vcpu *curr = current;
- struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt;
- long rc = 0;
-
- /* If no table is presented then clear the entire virtual IDT. */
- if ( guest_handle_is_null(traps) )
- {
- memset(dst, 0, NR_VECTORS * sizeof(*dst));
- init_int80_direct_trap(curr);
- return 0;
- }
-
- for ( ; ; )
- {
- if ( copy_from_guest(&cur, traps, 1) )
- {
- rc = -EFAULT;
- break;
- }
-
- if ( cur.address == 0 )
- break;
-
- if ( !is_canonical_address(cur.address) )
- return -EINVAL;
-
- fixup_guest_code_selector(curr->domain, cur.cs);
-
- memcpy(&dst[cur.vector], &cur, sizeof(cur));
-
- if ( cur.vector == 0x80 )
- init_int80_direct_trap(curr);
-
- guest_handle_add_offset(traps, 1);
-
- if ( hypercall_preempt_check() )
- {
- rc = hypercall_create_continuation(
- __HYPERVISOR_set_trap_table, "h", traps);
- break;
- }
- }
-
- return rc;
-}
-
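From the guest side, do_set_trap_table() is reached roughly as below. A sketch assuming the public trap_info_t layout, with 0xe033 (FLAT_RING3_CS64) as the 64-bit PV guest code selector; the hypercall stub and handler symbols are illustrative:

    #include <stdint.h>

    struct trap_info {
        uint8_t       vector;   /* exception/interrupt vector           */
        uint8_t       flags;    /* bits 0-1: descriptor privilege level */
        uint16_t      cs;       /* code selector                        */
        unsigned long address;  /* handler entry point                  */
    };

    extern long HYPERVISOR_set_trap_table(const struct trap_info *table);
    extern void divide_error(void), int80_entry(void);

    static const struct trap_info bootstrap_traps[] = {
        { 0x00, 0, 0xe033, (unsigned long)divide_error },
        { 0x80, 3, 0xe033, (unsigned long)int80_entry }, /* DPL 3 for "int $0x80" */
        { 0, 0, 0, 0 },  /* address == 0 ends the table, as checked above */
    };

    /* e.g. early in guest boot: HYPERVISOR_set_trap_table(bootstrap_traps); */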
void activate_debugregs(const struct vcpu *curr)
{
ASSERT(curr == current);
@@ -4157,31 +2071,6 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
return 0;
}
-long do_set_debugreg(int reg, unsigned long value)
-{
- return set_debugreg(current, reg, value);
-}
-
-unsigned long do_get_debugreg(int reg)
-{
- struct vcpu *curr = current;
-
- switch ( reg )
- {
- case 0 ... 3:
- case 6:
- return curr->arch.debugreg[reg];
- case 7:
- return (curr->arch.debugreg[7] |
- curr->arch.debugreg[5]);
- case 4 ... 5:
- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
- curr->arch.debugreg[reg + 2] : 0);
- }
-
- return -EINVAL;
-}
-
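The "reg + 2" mapping in do_get_debugreg() reflects that %dr4 and %dr5 are legacy aliases of %dr6 and %dr7, with CR4.DE deciding how accesses to them behave. A small stand-alone illustration of the index arithmetic (the guest state shown is hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned long debugreg[8] = { 0 };
        int reg = 4, cr4_de = 1;        /* hypothetical guest state */

        debugreg[6] = 0xffff0ff0UL;     /* architectural %dr6 reset value */

        /* reg + 2 maps 4 -> 6 and 5 -> 7, mirroring the hypercall above. */
        printf("%%dr%d reads %#lx\n", reg, cr4_de ? debugreg[reg + 2] : 0UL);

        return 0;
    }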
void asm_domain_crash_synchronous(unsigned long addr)
{
/*
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
index e3884d8406..a8656c3574 100644
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -51,4 +51,23 @@ uint32_t guest_io_read(unsigned int port, unsigned int bytes,
void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
struct domain *);
+int emulate_privileged_op(struct cpu_user_regs *regs);
+int emulate_invalid_rdtscp(struct cpu_user_regs *regs);
+int emulate_forced_invalid_op(struct cpu_user_regs *regs);
+void emulate_gate_op(struct cpu_user_regs *regs);
+
+static inline const char *trapstr(unsigned int trapnr)
+{
+ static const char * const strings[] = {
+ "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
+ "invalid opcode", "device not available", "double fault",
+ "coprocessor segment", "invalid tss", "segment not found",
+ "stack error", "general protection fault", "page fault",
+ "spurious interrupt", "coprocessor error", "alignment check",
+ "machine check", "simd error", "virtualisation exception"
+ };
+
+ return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
+}
+
#endif /* ASM_TRAP_H */
--
2.11.0
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel