From: Wei Liu <wei.liu2@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>,
Wei Liu <wei.liu2@citrix.com>, Jan Beulich <JBeulich@suse.com>
Subject: [PATCH for-next v3 01/22] x86/traps: move privileged instruction emulation code
Date: Thu, 18 May 2017 18:09:43 +0100
Message-ID: <20170518171004.27204-2-wei.liu2@citrix.com>
In-Reply-To: <20170518171004.27204-1-wei.liu2@citrix.com>
Move the privileged instruction emulation code to pv/emulate.c and
export emulate_privileged_op via the new asm-x86/pv/traps.h header.

Note that read_descriptor is duplicated in emulate.c for now; the
duplication will be gone once all emulation code has been moved.

No functional change.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
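The asm-x86/pv/traps.h hunk is not visible in this excerpt, so for
reviewers skimming the diff, the export it introduces boils down to the
sketch below. This is a condensed illustration only: the guard name and
the bare forward declaration are assumptions, and the real 48-line
header also carries the usual licence block and emacs footer.

    #ifndef __X86_PV_TRAPS_H__          /* assumed guard name */
    #define __X86_PV_TRAPS_H__

    struct cpu_user_regs;

    /*
     * Emulate a PV guest's privileged instruction, called from the
     * #GP handler path in traps.c. Returns EXCRET_fault_fixed
     * (non-zero) when the fault has been handled, 0 otherwise.
     */
    int emulate_privileged_op(struct cpu_user_regs *regs);

    #endif /* __X86_PV_TRAPS_H__ */

traps.c consumes this via the #include <asm/pv/traps.h> added in the
first hunk of the traps.c diff below.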
xen/arch/x86/pv/Makefile | 1 +
xen/arch/x86/pv/emulate.c | 1470 ++++++++++++++++++++++++++++++++++++++++
xen/arch/x86/traps.c | 1358 +------------------------------------
xen/include/asm-x86/pv/traps.h | 48 ++
4 files changed, 1521 insertions(+), 1356 deletions(-)
create mode 100644 xen/arch/x86/pv/emulate.c
create mode 100644 xen/include/asm-x86/pv/traps.h
diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile
index 489a9f59cb..564202cbb7 100644
--- a/xen/arch/x86/pv/Makefile
+++ b/xen/arch/x86/pv/Makefile
@@ -3,3 +3,4 @@ obj-y += traps.o
obj-bin-y += dom0_build.init.o
obj-y += domain.o
+obj-y += emulate.o
diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c
new file mode 100644
index 0000000000..fb0d066a3b
--- /dev/null
+++ b/xen/arch/x86/pv/emulate.c
@@ -0,0 +1,1470 @@
+/******************************************************************************
+ * arch/x86/pv/emulate.c
+ *
+ * PV emulation code
+ *
+ * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
+#include <xen/iocap.h>
+#include <xen/spinlock.h>
+#include <xen/trace.h>
+
+#include <asm/apic.h>
+#include <asm/debugreg.h>
+#include <asm/hpet.h>
+#include <asm/hypercall.h>
+#include <asm/mc146818rtc.h>
+#include <asm/p2m.h>
+#include <asm/pv/traps.h>
+#include <asm/shared.h>
+#include <asm/traps.h>
+#include <asm/x86_emulate.h>
+
+#include <xsm/xsm.h>
+
+#include "../x86_64/mmconfig.h"
+
+/******************
+ * Helper functions
+ */
+
+static int read_descriptor(unsigned int sel,
+ const struct vcpu *v,
+ unsigned long *base,
+ unsigned long *limit,
+ unsigned int *ar,
+ bool_t insn_fetch)
+{
+ struct desc_struct desc;
+
+ if ( sel < 4)
+ desc.b = desc.a = 0;
+ else if ( __get_user(desc,
+ (const struct desc_struct *)(!(sel & 4)
+ ? GDT_VIRT_START(v)
+ : LDT_VIRT_START(v))
+ + (sel >> 3)) )
+ return 0;
+ if ( !insn_fetch )
+ desc.b &= ~_SEGMENT_L;
+
+ *ar = desc.b & 0x00f0ff00;
+ if ( !(desc.b & _SEGMENT_L) )
+ {
+ *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
+ (desc.b & 0xff000000));
+ *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
+ if ( desc.b & _SEGMENT_G )
+ *limit = ((*limit + 1) << 12) - 1;
+#ifndef NDEBUG
+ if ( sel > 3 )
+ {
+ unsigned int a, l;
+ unsigned char valid;
+
+ asm volatile (
+ "larl %2,%0 ; setz %1"
+ : "=r" (a), "=qm" (valid) : "rm" (sel));
+ BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
+ asm volatile (
+ "lsll %2,%0 ; setz %1"
+ : "=r" (l), "=qm" (valid) : "rm" (sel));
+ BUG_ON(valid && (l != *limit));
+ }
+#endif
+ }
+ else
+ {
+ *base = 0UL;
+ *limit = ~0UL;
+ }
+
+ return 1;
+}
+
+/***********************
+ * I/O emulation support
+ */
+
+struct priv_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ char *io_emul_stub;
+ unsigned int bpmatch;
+ unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
+void host_to_guest_gpr_switch(struct cpu_user_regs *);
+unsigned long guest_to_host_gpr_switch(unsigned long);
+
+void (*pv_post_outb_hook)(unsigned int port, u8 value);
+
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+ unsigned int port, unsigned int bytes)
+{
+ if ( !ctxt->io_emul_stub )
+ ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+ (this_cpu(stubs.addr) &
+ ~PAGE_MASK) +
+ STUB_BUF_SIZE / 2;
+
+ /* movq $host_to_guest_gpr_switch,%rcx */
+ ctxt->io_emul_stub[0] = 0x48;
+ ctxt->io_emul_stub[1] = 0xb9;
+ *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+ /* callq *%rcx */
+ ctxt->io_emul_stub[10] = 0xff;
+ ctxt->io_emul_stub[11] = 0xd1;
+ /* data16 or nop */
+ ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+ /* <io-access opcode> */
+ ctxt->io_emul_stub[13] = opcode;
+ /* imm8 or nop */
+ ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+ /* ret (jumps to guest_to_host_gpr_switch) */
+ ctxt->io_emul_stub[15] = 0xc3;
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+ if ( ioemul_handle_quirk )
+ ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+ /* Handy function-typed pointer to the stub. */
+ return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+
+/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
+static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
+{
+ unsigned int cpl = guest_kernel_mode(v, regs) ?
+ (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
+
+ ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
+
+ return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
+}
+
+/* Has the guest requested sufficient permission for this I/O access? */
+static int guest_io_okay(
+ unsigned int port, unsigned int bytes,
+ struct vcpu *v, struct cpu_user_regs *regs)
+{
+ /* If in user mode, switch to kernel mode just to read I/O bitmap. */
+ int user_mode = !(v->arch.flags & TF_kernel_mode);
+#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
+
+ if ( iopl_ok(v, regs) )
+ return 1;
+
+ if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
+ {
+ union { uint8_t bytes[2]; uint16_t mask; } x;
+
+ /*
+ * Grab permission bytes from guest space. Inaccessible bytes are
+ * read as 0xff (no access allowed).
+ */
+ TOGGLE_MODE();
+ switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
+ port>>3, 2) )
+ {
+ default: x.bytes[0] = ~0;
+ /* fallthrough */
+ case 1: x.bytes[1] = ~0;
+ /* fallthrough */
+ case 0: break;
+ }
+ TOGGLE_MODE();
+
+ if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Has the administrator granted sufficient permission for this I/O access? */
+static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
+ const struct domain *d)
+{
+ /*
+ * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
+ * We never permit direct access to that register.
+ */
+ if ( (port == 0xcf8) && (bytes == 4) )
+ return 0;
+
+ /* We also never permit direct access to the RTC/CMOS registers. */
+ if ( ((port & ~1) == RTC_PORT(0)) )
+ return 0;
+
+ return ioports_access_permitted(d, port, port + bytes - 1);
+}
+
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+ unsigned int size, uint32_t *write)
+{
+ uint32_t machine_bdf;
+
+ if ( !is_hardware_domain(currd) )
+ return 0;
+
+ if ( !CF8_ENABLED(currd->arch.pci_cf8) )
+ return 1;
+
+ machine_bdf = CF8_BDF(currd->arch.pci_cf8);
+ if ( write )
+ {
+ const unsigned long *ro_map = pci_get_ro_map(0);
+
+ if ( ro_map && test_bit(machine_bdf, ro_map) )
+ return 0;
+ }
+ start |= CF8_ADDR_LO(currd->arch.pci_cf8);
+ /* AMD extended configuration space access? */
+ if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
+ {
+ uint64_t msr_val;
+
+ if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
+ return 0;
+ if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
+ start |= CF8_ADDR_HI(currd->arch.pci_cf8);
+ }
+
+ return !write ?
+ xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+ start, start + size - 1, 0) == 0 :
+ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
+}
+
+uint32_t guest_io_read(unsigned int port, unsigned int bytes,
+ struct domain *currd)
+{
+ uint32_t data = 0;
+ unsigned int shift = 0;
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ switch ( bytes )
+ {
+ case 1: return inb(port);
+ case 2: return inw(port);
+ case 4: return inl(port);
+ }
+ }
+
+ while ( bytes != 0 )
+ {
+ unsigned int size = 1;
+ uint32_t sub_data = ~0;
+
+ if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+ {
+ sub_data = pv_pit_handler(port, 0, 0);
+ }
+ else if ( port == RTC_PORT(0) )
+ {
+ sub_data = currd->arch.cmos_idx;
+ }
+ else if ( (port == RTC_PORT(1)) &&
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ sub_data = inb(RTC_PORT(1));
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+ else if ( (port == 0xcf8) && (bytes == 4) )
+ {
+ size = 4;
+ sub_data = currd->arch.pci_cf8;
+ }
+ else if ( (port & 0xfffc) == 0xcfc )
+ {
+ size = min(bytes, 4 - (port & 3));
+ if ( size == 3 )
+ size = 2;
+ if ( pci_cfg_ok(currd, port & 3, size, NULL) )
+ sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
+ }
+
+ if ( size == 4 )
+ return sub_data;
+
+ data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
+ shift += size * 8;
+ port += size;
+ bytes -= size;
+ }
+
+ return data;
+}
+
+static unsigned int check_guest_io_breakpoint(struct vcpu *v,
+ unsigned int port, unsigned int len)
+{
+ unsigned int width, i, match = 0;
+ unsigned long start;
+
+ if ( !(v->arch.debugreg[5]) ||
+ !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
+ return 0;
+
+ for ( i = 0; i < 4; i++ )
+ {
+ if ( !(v->arch.debugreg[5] &
+ (3 << (i * DR_ENABLE_SIZE))) )
+ continue;
+
+ start = v->arch.debugreg[i];
+ width = 0;
+
+ switch ( (v->arch.debugreg[7] >>
+ (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
+ {
+ case DR_LEN_1: width = 1; break;
+ case DR_LEN_2: width = 2; break;
+ case DR_LEN_4: width = 4; break;
+ case DR_LEN_8: width = 8; break;
+ }
+
+ if ( (start < (port + len)) && ((start + width) > port) )
+ match |= 1 << i;
+ }
+
+ return match;
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+ unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* INS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ return X86EMUL_DONE;
+ }
+
+ *val = guest_io_read(port, bytes, currd);
+
+ return X86EMUL_OKAY;
+}
+
+void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
+ struct domain *currd)
+{
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ switch ( bytes ) {
+ case 1:
+ outb((uint8_t)data, port);
+ if ( pv_post_outb_hook )
+ pv_post_outb_hook(port, (uint8_t)data);
+ break;
+ case 2:
+ outw((uint16_t)data, port);
+ break;
+ case 4:
+ outl(data, port);
+ break;
+ }
+ return;
+ }
+
+ while ( bytes != 0 )
+ {
+ unsigned int size = 1;
+
+ if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+ {
+ pv_pit_handler(port, (uint8_t)data, 1);
+ }
+ else if ( port == RTC_PORT(0) )
+ {
+ currd->arch.cmos_idx = data;
+ }
+ else if ( (port == RTC_PORT(1)) &&
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+ {
+ unsigned long flags;
+
+ if ( pv_rtc_handler )
+ pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
+ spin_lock_irqsave(&rtc_lock, flags);
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ outb(data, RTC_PORT(1));
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+ else if ( (port == 0xcf8) && (bytes == 4) )
+ {
+ size = 4;
+ currd->arch.pci_cf8 = data;
+ }
+ else if ( (port & 0xfffc) == 0xcfc )
+ {
+ size = min(bytes, 4 - (port & 3));
+ if ( size == 3 )
+ size = 2;
+ if ( pci_cfg_ok(currd, port & 3, size, &data) )
+ pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
+ }
+
+ if ( size == 4 )
+ return;
+
+ port += size;
+ bytes -= size;
+ data >>= size * 8;
+ }
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+ unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* OUTS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ if ( (bytes == 1) && pv_post_outb_hook )
+ pv_post_outb_hook(port, val);
+ return X86EMUL_DONE;
+ }
+
+ guest_io_write(port, bytes, val, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+ struct segment_register *reg,
+ struct x86_emulate_ctxt *ctxt)
+{
+ /* Check if this is an attempt to access the I/O bitmap. */
+ if ( seg == x86_seg_tr )
+ {
+ switch ( ctxt->opcode )
+ {
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ /* Defer the check to priv_op_{read,write}_io(). */
+ return X86EMUL_DONE;
+ }
+ }
+
+ if ( ctxt->addr_size < 64 )
+ {
+ unsigned long limit;
+ unsigned int sel, ar;
+
+ switch ( seg )
+ {
+ case x86_seg_cs: sel = ctxt->regs->cs; break;
+ case x86_seg_ds: sel = read_sreg(ds); break;
+ case x86_seg_es: sel = read_sreg(es); break;
+ case x86_seg_fs: sel = read_sreg(fs); break;
+ case x86_seg_gs: sel = read_sreg(gs); break;
+ case x86_seg_ss: sel = ctxt->regs->ss; break;
+ default: return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+ return X86EMUL_UNHANDLEABLE;
+
+ reg->limit = limit;
+ reg->attr.bytes = ar >> 8;
+ }
+ else
+ {
+ switch ( seg )
+ {
+ default:
+ if ( !is_x86_user_segment(seg) )
+ return X86EMUL_UNHANDLEABLE;
+ reg->base = 0;
+ break;
+ case x86_seg_fs:
+ reg->base = rdfsbase();
+ break;
+ case x86_seg_gs:
+ reg->base = rdgsbase();
+ break;
+ }
+
+ reg->limit = ~0U;
+
+ reg->attr.bytes = 0;
+ reg->attr.fields.type = _SEGMENT_WR >> 8;
+ if ( seg == x86_seg_cs )
+ {
+ reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+ reg->attr.fields.l = 1;
+ }
+ else
+ reg->attr.fields.db = 1;
+ reg->attr.fields.s = 1;
+ reg->attr.fields.dpl = 3;
+ reg->attr.fields.p = 1;
+ reg->attr.fields.g = 1;
+ }
+
+ /*
+ * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+ * Also do this for consistency for non-conforming code segments.
+ */
+ if ( (seg == x86_seg_ss ||
+ (seg == x86_seg_cs &&
+ !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+ guest_kernel_mode(current, ctxt->regs) )
+ reg->attr.fields.dpl = 0;
+
+ return X86EMUL_OKAY;
+}
+
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+ unsigned int bytes, unsigned long limit,
+ enum x86_segment seg,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned long *addr)
+{
+ int rc = X86EMUL_OKAY;
+
+ *addr = base + offset;
+
+ if ( ctxt->addr_size < 64 )
+ {
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ rc = X86EMUL_EXCEPTION;
+ *addr = (uint32_t)*addr;
+ }
+ else if ( !__addr_ok(*addr) )
+ rc = X86EMUL_EXCEPTION;
+
+ if ( unlikely(rc == X86EMUL_EXCEPTION) )
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+
+ return rc;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+ enum x86_segment seg, unsigned long offset,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ ASSERT(seg == x86_seg_es);
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+ {
+ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, x86_seg_es, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(PFEC_write_access,
+ addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+ uint16_t port,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(seg, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+ {
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = 0;
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, seg, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ guest_io_write(port, bytes_per_rep, data, currd);
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct vcpu *curr = current;
+
+ switch ( reg )
+ {
+ case 0: /* Read CR0 */
+ *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+ return X86EMUL_OKAY;
+
+ case 2: /* Read CR2 */
+ case 4: /* Read CR4 */
+ *val = curr->arch.pv_vcpu.ctrlreg[reg];
+ return X86EMUL_OKAY;
+
+ case 3: /* Read CR3 */
+ {
+ const struct domain *currd = curr->domain;
+ unsigned long mfn;
+
+ if ( !is_pv_32bit_domain(currd) )
+ {
+ mfn = pagetable_get_pfn(curr->arch.guest_table);
+ *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+ }
+ else
+ {
+ l4_pgentry_t *pl4e =
+ map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+ mfn = l4e_get_pfn(*pl4e);
+ unmap_domain_page(pl4e);
+ *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+ }
+ /* PTs should not be shared */
+ BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+ return X86EMUL_OKAY;
+ }
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *curr = current;
+
+ switch ( reg )
+ {
+ case 0: /* Write CR0 */
+ if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Attempt to change unmodifiable CR0 flags\n");
+ break;
+ }
+ do_fpu_taskswitch(!!(val & X86_CR0_TS));
+ return X86EMUL_OKAY;
+
+ case 2: /* Write CR2 */
+ curr->arch.pv_vcpu.ctrlreg[2] = val;
+ arch_set_cr2(curr, val);
+ return X86EMUL_OKAY;
+
+ case 3: /* Write CR3 */
+ {
+ struct domain *currd = curr->domain;
+ unsigned long gfn;
+ struct page_info *page;
+ int rc;
+
+ gfn = !is_pv_32bit_domain(currd)
+ ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+ page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+ if ( !page )
+ break;
+ rc = new_guest_cr3(page_to_mfn(page));
+ put_page(page);
+
+ switch ( rc )
+ {
+ case 0:
+ return X86EMUL_OKAY;
+ case -ERESTART: /* retry after preemption */
+ return X86EMUL_RETRY;
+ }
+ break;
+ }
+
+ case 4: /* Write CR4 */
+ curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+ write_cr4(pv_guest_cr4_to_real_cr4(curr));
+ ctxt_switch_levelling(curr);
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long res = do_get_debugreg(reg);
+
+ if ( IS_ERR_VALUE(res) )
+ return X86EMUL_UNHANDLEABLE;
+
+ *val = res;
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ return do_set_debugreg(reg, val) == 0
+ ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+
+static inline uint64_t guest_misc_enable(uint64_t val)
+{
+ val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+ MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
+ val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+ return val;
+}
+
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+ return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+ is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ const struct vcpu *curr = current;
+ const struct domain *currd = curr->domain;
+ bool vpmu_msr = false;
+
+ switch ( reg )
+ {
+ int rc;
+
+ case MSR_FS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+ return X86EMUL_OKAY;
+
+ case MSR_GS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = cpu_has_fsgsbase ? __rdgsbase()
+ : curr->arch.pv_vcpu.gs_base_kernel;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+ *val = curr->arch.pv_vcpu.gs_base_user;
+ return X86EMUL_OKAY;
+
+ /*
+ * In order to fully retain original behavior, defer calling
+ * pv_soft_rdtsc() until after emulation. This may want/need to be
+ * reconsidered.
+ */
+ case MSR_IA32_TSC:
+ poc->tsc |= TSC_BASE;
+ goto normal;
+
+ case MSR_TSC_AUX:
+ poc->tsc |= TSC_AUX;
+ if ( cpu_has_rdtscp )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+ *val = read_efer();
+ if ( is_pv_32bit_domain(currd) )
+ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_CTL:
+ case MSR_K7_FID_VID_STATUS:
+ case MSR_K8_PSTATE_LIMIT:
+ case MSR_K8_PSTATE_CTRL:
+ case MSR_K8_PSTATE_STATUS:
+ case MSR_K8_PSTATE0:
+ case MSR_K8_PSTATE1:
+ case MSR_K8_PSTATE2:
+ case MSR_K8_PSTATE3:
+ case MSR_K8_PSTATE4:
+ case MSR_K8_PSTATE5:
+ case MSR_K8_PSTATE6:
+ case MSR_K8_PSTATE7:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+ break;
+ if ( unlikely(is_cpufreq_controller(currd)) )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_UCODE_REV:
+ BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+ break;
+ /* As documented in the SDM: Do a CPUID 1 here */
+ cpuid_eax(1);
+ }
+ goto normal;
+
+ case MSR_IA32_MISC_ENABLE:
+ if ( rdmsr_safe(reg, *val) )
+ break;
+ *val = guest_misc_enable(*val);
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR0_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+ break;
+ *val = curr->arch.pv_vcpu.dr_mask[0];
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+ break;
+ *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_PERF_CAPABILITIES:
+ /* No extra capabilities are supported. */
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_PLATFORM_INFO:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
+ break;
+ *val = 0;
+ if ( this_cpu(cpuid_faulting_enabled) )
+ *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
+ break;
+ *val = 0;
+ if ( curr->arch.cpuid_faulting )
+ *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
+ return X86EMUL_OKAY;
+
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ vpmu_msr = true;
+ /* fall through */
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+ if ( vpmu_do_rdmsr(reg, val) )
+ break;
+ return X86EMUL_OKAY;
+ }
+ }
+ /* fall through */
+ default:
+ if ( rdmsr_hypervisor_regs(reg, val) )
+ return X86EMUL_OKAY;
+
+ rc = vmce_rdmsr(reg, val);
+ if ( rc < 0 )
+ break;
+ if ( rc )
+ return X86EMUL_OKAY;
+ /* fall through */
+ normal:
+ /* Everyone can read the MSR space. */
+ /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+ if ( rdmsr_safe(reg, *val) )
+ break;
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *curr = current;
+ const struct domain *currd = curr->domain;
+ bool vpmu_msr = false;
+
+ switch ( reg )
+ {
+ uint64_t temp;
+ int rc;
+
+ case MSR_FS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrfsbase(val);
+ curr->arch.pv_vcpu.fs_base = val;
+ return X86EMUL_OKAY;
+
+ case MSR_GS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrgsbase(val);
+ curr->arch.pv_vcpu.gs_base_kernel = val;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+ if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+ break;
+ wrmsrl(MSR_SHADOW_GS_BASE, val);
+ curr->arch.pv_vcpu.gs_base_user = val;
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_STATUS:
+ case MSR_K7_FID_VID_CTL:
+ case MSR_K8_PSTATE_LIMIT:
+ case MSR_K8_PSTATE_CTRL:
+ case MSR_K8_PSTATE_STATUS:
+ case MSR_K8_PSTATE0:
+ case MSR_K8_PSTATE1:
+ case MSR_K8_PSTATE2:
+ case MSR_K8_PSTATE3:
+ case MSR_K8_PSTATE4:
+ case MSR_K8_PSTATE5:
+ case MSR_K8_PSTATE6:
+ case MSR_K8_PSTATE7:
+ case MSR_K8_HWCR:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_AMD64_NB_CFG:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+ ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+ break;
+ if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+ temp != val :
+ ((temp ^ val) &
+ ~(FAM10H_MMIO_CONF_ENABLE |
+ (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+ FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+ ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+ FAM10H_MMIO_CONF_BASE_SHIFT))) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+ return X86EMUL_OKAY;
+ if ( rdmsr_safe(reg, temp) )
+ break;
+ if ( val )
+ goto invalid;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_MISC_ENABLE:
+ if ( rdmsr_safe(reg, temp) )
+ break;
+ if ( val != guest_misc_enable(temp) )
+ goto invalid;
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_MPERF:
+ case MSR_IA32_APERF:
+ if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_PERF_CTL:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( likely(!is_cpufreq_controller(currd)) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_IA32_THERM_CONTROL:
+ case MSR_IA32_ENERGY_PERF_BIAS:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ break;
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+ wrmsr_safe(reg, val) == 0 )
+ return X86EMUL_OKAY;
+ break;
+
+ case MSR_AMD64_DR0_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+ break;
+ curr->arch.pv_vcpu.dr_mask[0] = val;
+ if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+ wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+ return X86EMUL_OKAY;
+
+ case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+ if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+ break;
+ curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+ if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+ wrmsrl(reg, val);
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_PLATFORM_INFO:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
+ break;
+ return X86EMUL_OKAY;
+
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
+ rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
+ break;
+ if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
+ !this_cpu(cpuid_faulting_enabled) )
+ break;
+ curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
+ return X86EMUL_OKAY;
+
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ vpmu_msr = true;
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+ if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+ !is_hardware_domain(currd) )
+ return X86EMUL_OKAY;
+
+ if ( vpmu_do_wrmsr(reg, val, 0) )
+ break;
+ return X86EMUL_OKAY;
+ }
+ }
+ /* fall through */
+ default:
+ if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+ return X86EMUL_OKAY;
+
+ rc = vmce_wrmsr(reg, val);
+ if ( rc < 0 )
+ break;
+ if ( rc )
+ return X86EMUL_OKAY;
+
+ if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+ invalid:
+ gdprintk(XENLOG_WARNING,
+ "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+ reg, temp, val);
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ /* Ignore the instruction if unprivileged. */
+ if ( !cache_flush_permitted(current->domain) )
+ /*
+ * Non-physdev domain attempted WBINVD; ignore for now since
+ * newer linux uses this in some start-of-day timing loops.
+ */
+ ;
+ else
+ wbinvd();
+
+ return X86EMUL_OKAY;
+}
+
+int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
+ struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
+{
+ guest_cpuid(current, leaf, subleaf, res);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_validate(const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt)
+{
+ switch ( ctxt->opcode )
+ {
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ case X86EMUL_OPC(0x0f, 0x06): /* clts */
+ case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+ case X86EMUL_OPC(0x0f, 0x20) ...
+ X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+ case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+ case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+ case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+ case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+ return X86EMUL_OKAY;
+
+ case 0xfa: case 0xfb: /* cli / sti */
+ if ( !iopl_ok(current, ctxt->regs) )
+ break;
+ /*
+ * This is just too dangerous to allow, in my opinion. Consider if the
+ * caller then tries to reenable interrupts using POPF: we can't trap
+ * that and we'll end up with hard-to-debug lockups. Fast & loose will
+ * do for us. :-)
+ vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
+ */
+ return X86EMUL_DONE;
+
+ case X86EMUL_OPC(0x0f, 0x01):
+ {
+ unsigned int modrm_rm, modrm_reg;
+
+ if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+ (modrm_rm & 7) != 1 )
+ break;
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* xsetbv */
+ case 7: /* rdtscp */
+ return X86EMUL_OKAY;
+ }
+ break;
+ }
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct priv_op_ctxt *poc =
+ container_of(ctxt, struct priv_op_ctxt, ctxt);
+ unsigned int rc;
+ unsigned long addr = poc->cs.base + offset;
+
+ ASSERT(seg == x86_seg_cs);
+
+ /* We don't mean to emulate any branches. */
+ if ( !bytes )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+ x86_seg_cs, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+ {
+ /*
+ * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+ * cpu_has_nx, but we'd then need a "fetch" variant of
+ * __copy_from_user() respecting NX, SMEP, and protection keys.
+ */
+ x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+
+static const struct x86_emulate_ops priv_op_ops = {
+ .insn_fetch = priv_op_insn_fetch,
+ .read = x86emul_unhandleable_rw,
+ .validate = priv_op_validate,
+ .read_io = priv_op_read_io,
+ .write_io = priv_op_write_io,
+ .rep_ins = priv_op_rep_ins,
+ .rep_outs = priv_op_rep_outs,
+ .read_segment = priv_op_read_segment,
+ .read_cr = priv_op_read_cr,
+ .write_cr = priv_op_write_cr,
+ .read_dr = priv_op_read_dr,
+ .write_dr = priv_op_write_dr,
+ .read_msr = priv_op_read_msr,
+ .write_msr = priv_op_write_msr,
+ .cpuid = pv_emul_cpuid,
+ .wbinvd = priv_op_wbinvd,
+};
+
+int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct priv_op_ctxt ctxt = {
+ .ctxt.regs = regs,
+ .ctxt.vendor = currd->arch.cpuid->x86_vendor,
+ .ctxt.lma = !is_pv_32bit_domain(currd),
+ };
+ int rc;
+ unsigned int eflags, ar;
+
+ if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 1) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ return 0;
+
+ /* Mirror virtualized state into EFLAGS. */
+ ASSERT(regs->eflags & X86_EFLAGS_IF);
+ if ( vcpu_info(curr, evtchn_upcall_mask) )
+ regs->eflags &= ~X86_EFLAGS_IF;
+ else
+ regs->eflags |= X86_EFLAGS_IF;
+ ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
+ regs->eflags |= curr->arch.pv_vcpu.iopl;
+ eflags = regs->eflags;
+
+ ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+ rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
+
+ if ( ctxt.io_emul_stub )
+ unmap_domain_page(ctxt.io_emul_stub);
+
+ /*
+ * Un-mirror virtualized state from EFLAGS.
+ * Nothing we allow to be emulated can change anything other than the
+ * arithmetic bits, and the resume flag.
+ */
+ ASSERT(!((regs->eflags ^ eflags) &
+ ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
+ regs->eflags |= X86_EFLAGS_IF;
+ regs->eflags &= ~X86_EFLAGS_IOPL;
+
+ switch ( rc )
+ {
+ case X86EMUL_OKAY:
+ if ( ctxt.tsc & TSC_BASE )
+ {
+ if ( ctxt.tsc & TSC_AUX )
+ pv_soft_rdtsc(curr, regs, 1);
+ else if ( currd->arch.vtsc )
+ pv_soft_rdtsc(curr, regs, 0);
+ else
+ msr_split(regs, rdtsc());
+ }
+
+ if ( ctxt.ctxt.retire.singlestep )
+ ctxt.bpmatch |= DR_STEP;
+ if ( ctxt.bpmatch )
+ {
+ curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+ if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+ pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+ }
+ /* fall through */
+ case X86EMUL_RETRY:
+ return EXCRET_fault_fixed;
+
+ case X86EMUL_EXCEPTION:
+ pv_inject_event(&ctxt.ctxt.event);
+ return EXCRET_fault_fixed;
+ }
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index cd8ca20398..cd43e9f44c 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -78,6 +78,8 @@
#include <asm/cpuid.h>
#include <xsm/xsm.h>
+#include <asm/pv/traps.h>
+
/*
* opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
* fatal: Xen prints diagnostic message and then hangs.
@@ -705,41 +707,6 @@ static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
}
}
-static unsigned int check_guest_io_breakpoint(struct vcpu *v,
- unsigned int port, unsigned int len)
-{
- unsigned int width, i, match = 0;
- unsigned long start;
-
- if ( !(v->arch.debugreg[5]) ||
- !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
- return 0;
-
- for ( i = 0; i < 4; i++ )
- {
- if ( !(v->arch.debugreg[5] &
- (3 << (i * DR_ENABLE_SIZE))) )
- continue;
-
- start = v->arch.debugreg[i];
- width = 0;
-
- switch ( (v->arch.debugreg[7] >>
- (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
- {
- case DR_LEN_1: width = 1; break;
- case DR_LEN_2: width = 2; break;
- case DR_LEN_4: width = 4; break;
- case DR_LEN_8: width = 8; break;
- }
-
- if ( (start < (port + len)) && ((start + width) > port) )
- match |= 1 << i;
- }
-
- return match;
-}
-
/*
* Called from asm to set up the MCE trapbounce info.
* Returns 0 if no callback is set up, else 1.
@@ -1733,1327 +1700,6 @@ static int read_gate_descriptor(unsigned int gate_sel,
return 1;
}
-static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
- unsigned int bytes, unsigned long limit,
- enum x86_segment seg,
- struct x86_emulate_ctxt *ctxt,
- unsigned long *addr)
-{
- int rc = X86EMUL_OKAY;
-
- *addr = base + offset;
-
- if ( ctxt->addr_size < 64 )
- {
- if ( limit < bytes - 1 || offset > limit - bytes + 1 )
- rc = X86EMUL_EXCEPTION;
- *addr = (uint32_t)*addr;
- }
- else if ( !__addr_ok(*addr) )
- rc = X86EMUL_EXCEPTION;
-
- if ( unlikely(rc == X86EMUL_EXCEPTION) )
- x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
- : TRAP_stack_error,
- 0, ctxt);
-
- return rc;
-}
-
-struct priv_op_ctxt {
- struct x86_emulate_ctxt ctxt;
- struct {
- unsigned long base, limit;
- } cs;
- char *io_emul_stub;
- unsigned int bpmatch;
- unsigned int tsc;
-#define TSC_BASE 1
-#define TSC_AUX 2
-};
-
-static int priv_op_insn_fetch(enum x86_segment seg,
- unsigned long offset,
- void *p_data,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- const struct priv_op_ctxt *poc =
- container_of(ctxt, struct priv_op_ctxt, ctxt);
- unsigned int rc;
- unsigned long addr = poc->cs.base + offset;
-
- ASSERT(seg == x86_seg_cs);
-
- /* We don't mean to emulate any branches. */
- if ( !bytes )
- return X86EMUL_UNHANDLEABLE;
-
- rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
- x86_seg_cs, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
- {
- /*
- * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
- * cpu_has_nx, but we'd then need a "fetch" variant of
- * __copy_from_user() respecting NX, SMEP, and protection keys.
- */
- x86_emul_pagefault(0, addr + bytes - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_read_segment(enum x86_segment seg,
- struct segment_register *reg,
- struct x86_emulate_ctxt *ctxt)
-{
- /* Check if this is an attempt to access the I/O bitmap. */
- if ( seg == x86_seg_tr )
- {
- switch ( ctxt->opcode )
- {
- case 0x6c ... 0x6f: /* ins / outs */
- case 0xe4 ... 0xe7: /* in / out (immediate port) */
- case 0xec ... 0xef: /* in / out (port in %dx) */
- /* Defer the check to priv_op_{read,write}_io(). */
- return X86EMUL_DONE;
- }
- }
-
- if ( ctxt->addr_size < 64 )
- {
- unsigned long limit;
- unsigned int sel, ar;
-
- switch ( seg )
- {
- case x86_seg_cs: sel = ctxt->regs->cs; break;
- case x86_seg_ds: sel = read_sreg(ds); break;
- case x86_seg_es: sel = read_sreg(es); break;
- case x86_seg_fs: sel = read_sreg(fs); break;
- case x86_seg_gs: sel = read_sreg(gs); break;
- case x86_seg_ss: sel = ctxt->regs->ss; break;
- default: return X86EMUL_UNHANDLEABLE;
- }
-
- if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
- return X86EMUL_UNHANDLEABLE;
-
- reg->limit = limit;
- reg->attr.bytes = ar >> 8;
- }
- else
- {
- switch ( seg )
- {
- default:
- if ( !is_x86_user_segment(seg) )
- return X86EMUL_UNHANDLEABLE;
- reg->base = 0;
- break;
- case x86_seg_fs:
- reg->base = rdfsbase();
- break;
- case x86_seg_gs:
- reg->base = rdgsbase();
- break;
- }
-
- reg->limit = ~0U;
-
- reg->attr.bytes = 0;
- reg->attr.fields.type = _SEGMENT_WR >> 8;
- if ( seg == x86_seg_cs )
- {
- reg->attr.fields.type |= _SEGMENT_CODE >> 8;
- reg->attr.fields.l = 1;
- }
- else
- reg->attr.fields.db = 1;
- reg->attr.fields.s = 1;
- reg->attr.fields.dpl = 3;
- reg->attr.fields.p = 1;
- reg->attr.fields.g = 1;
- }
-
- /*
- * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
- * Also do this for consistency for non-conforming code segments.
- */
- if ( (seg == x86_seg_ss ||
- (seg == x86_seg_cs &&
- !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
- guest_kernel_mode(current, ctxt->regs) )
- reg->attr.fields.dpl = 0;
-
- return X86EMUL_OKAY;
-}
-
-/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
-static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
-{
- unsigned int cpl = guest_kernel_mode(v, regs) ?
- (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
-
- ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
-
- return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
-}
-
-/* Has the guest requested sufficient permission for this I/O access? */
-static int guest_io_okay(
- unsigned int port, unsigned int bytes,
- struct vcpu *v, struct cpu_user_regs *regs)
-{
- /* If in user mode, switch to kernel mode just to read I/O bitmap. */
- int user_mode = !(v->arch.flags & TF_kernel_mode);
-#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
-
- if ( iopl_ok(v, regs) )
- return 1;
-
- if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
- {
- union { uint8_t bytes[2]; uint16_t mask; } x;
-
- /*
- * Grab permission bytes from guest space. Inaccessible bytes are
- * read as 0xff (no access allowed).
- */
- TOGGLE_MODE();
- switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
- port>>3, 2) )
- {
- default: x.bytes[0] = ~0;
- /* fallthrough */
- case 1: x.bytes[1] = ~0;
- /* fallthrough */
- case 0: break;
- }
- TOGGLE_MODE();
-
- if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
- return 1;
- }
-
- return 0;
-}
-
-/* Has the administrator granted sufficient permission for this I/O access? */
-static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
- const struct domain *d)
-{
- /*
- * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
- * We never permit direct access to that register.
- */
- if ( (port == 0xcf8) && (bytes == 4) )
- return 0;
-
- /* We also never permit direct access to the RTC/CMOS registers. */
- if ( ((port & ~1) == RTC_PORT(0)) )
- return 0;
-
- return ioports_access_permitted(d, port, port + bytes - 1);
-}
-
-static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
- unsigned int size, uint32_t *write)
-{
- uint32_t machine_bdf;
-
- if ( !is_hardware_domain(currd) )
- return 0;
-
- if ( !CF8_ENABLED(currd->arch.pci_cf8) )
- return 1;
-
- machine_bdf = CF8_BDF(currd->arch.pci_cf8);
- if ( write )
- {
- const unsigned long *ro_map = pci_get_ro_map(0);
-
- if ( ro_map && test_bit(machine_bdf, ro_map) )
- return 0;
- }
- start |= CF8_ADDR_LO(currd->arch.pci_cf8);
- /* AMD extended configuration space access? */
- if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
- {
- uint64_t msr_val;
-
- if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
- return 0;
- if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
- start |= CF8_ADDR_HI(currd->arch.pci_cf8);
- }
-
- return !write ?
- xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
- start, start + size - 1, 0) == 0 :
- pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
-}
-
-uint32_t guest_io_read(unsigned int port, unsigned int bytes,
- struct domain *currd)
-{
- uint32_t data = 0;
- unsigned int shift = 0;
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- switch ( bytes )
- {
- case 1: return inb(port);
- case 2: return inw(port);
- case 4: return inl(port);
- }
- }
-
- while ( bytes != 0 )
- {
- unsigned int size = 1;
- uint32_t sub_data = ~0;
-
- if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
- {
- sub_data = pv_pit_handler(port, 0, 0);
- }
- else if ( port == RTC_PORT(0) )
- {
- sub_data = currd->arch.cmos_idx;
- }
- else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
- {
- unsigned long flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
- sub_data = inb(RTC_PORT(1));
- spin_unlock_irqrestore(&rtc_lock, flags);
- }
- else if ( (port == 0xcf8) && (bytes == 4) )
- {
- size = 4;
- sub_data = currd->arch.pci_cf8;
- }
- else if ( (port & 0xfffc) == 0xcfc )
- {
- size = min(bytes, 4 - (port & 3));
- if ( size == 3 )
- size = 2;
- if ( pci_cfg_ok(currd, port & 3, size, NULL) )
- sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
- }
-
- if ( size == 4 )
- return sub_data;
-
- data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
- shift += size * 8;
- port += size;
- bytes -= size;
- }
-
- return data;
-}
-
-void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
- struct domain *currd)
-{
- if ( admin_io_okay(port, bytes, currd) )
- {
- switch ( bytes ) {
- case 1:
- outb((uint8_t)data, port);
- if ( pv_post_outb_hook )
- pv_post_outb_hook(port, (uint8_t)data);
- break;
- case 2:
- outw((uint16_t)data, port);
- break;
- case 4:
- outl(data, port);
- break;
- }
- return;
- }
-
- while ( bytes != 0 )
- {
- unsigned int size = 1;
-
- if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
- {
- pv_pit_handler(port, (uint8_t)data, 1);
- }
- else if ( port == RTC_PORT(0) )
- {
- currd->arch.cmos_idx = data;
- }
- else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
- {
- unsigned long flags;
-
- if ( pv_rtc_handler )
- pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
- spin_lock_irqsave(&rtc_lock, flags);
- outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
- outb(data, RTC_PORT(1));
- spin_unlock_irqrestore(&rtc_lock, flags);
- }
- else if ( (port == 0xcf8) && (bytes == 4) )
- {
- size = 4;
- currd->arch.pci_cf8 = data;
- }
- else if ( (port & 0xfffc) == 0xcfc )
- {
- size = min(bytes, 4 - (port & 3));
- if ( size == 3 )
- size = 2;
- if ( pci_cfg_ok(currd, port & 3, size, &data) )
- pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
- }
-
- if ( size == 4 )
- return;
-
- port += size;
- bytes -= size;
- data >>= size * 8;
- }
-}
-
-/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
-void host_to_guest_gpr_switch(struct cpu_user_regs *);
-unsigned long guest_to_host_gpr_switch(unsigned long);
-
-void (*pv_post_outb_hook)(unsigned int port, u8 value);
-
-typedef void io_emul_stub_t(struct cpu_user_regs *);
-
-static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
- unsigned int port, unsigned int bytes)
-{
- if ( !ctxt->io_emul_stub )
- ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
- (this_cpu(stubs.addr) &
- ~PAGE_MASK) +
- STUB_BUF_SIZE / 2;
-
- /* movq $host_to_guest_gpr_switch,%rcx */
- ctxt->io_emul_stub[0] = 0x48;
- ctxt->io_emul_stub[1] = 0xb9;
- *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
- /* callq *%rcx */
- ctxt->io_emul_stub[10] = 0xff;
- ctxt->io_emul_stub[11] = 0xd1;
- /* data16 or nop */
- ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
- /* <io-access opcode> */
- ctxt->io_emul_stub[13] = opcode;
- /* imm8 or nop */
- ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
- /* ret (jumps to guest_to_host_gpr_switch) */
- ctxt->io_emul_stub[15] = 0xc3;
- BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
- if ( ioemul_handle_quirk )
- ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
-
- /* Handy function-typed pointer to the stub. */
- return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-}
-
-static int priv_op_read_io(unsigned int port, unsigned int bytes,
- unsigned long *val, struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
-
- /* INS must not come here. */
- ASSERT((ctxt->opcode & ~9) == 0xe4);
-
- if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- io_emul_stub_t *io_emul =
- io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
- mark_regs_dirty(ctxt->regs);
- io_emul(ctxt->regs);
- return X86EMUL_DONE;
- }
-
- *val = guest_io_read(port, bytes, currd);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_write_io(unsigned int port, unsigned int bytes,
- unsigned long val, struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
-
- /* OUTS must not come here. */
- ASSERT((ctxt->opcode & ~9) == 0xe6);
-
- if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
- if ( admin_io_okay(port, bytes, currd) )
- {
- io_emul_stub_t *io_emul =
- io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
- mark_regs_dirty(ctxt->regs);
- io_emul(ctxt->regs);
- if ( (bytes == 1) && pv_post_outb_hook )
- pv_post_outb_hook(port, val);
- return X86EMUL_DONE;
- }
-
- guest_io_write(port, bytes, val, currd);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_ins(uint16_t port,
- enum x86_segment seg, unsigned long offset,
- unsigned int bytes_per_rep, unsigned long *reps,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
- unsigned long goal = *reps;
- struct segment_register sreg;
- int rc;
-
- ASSERT(seg == x86_seg_es);
-
- *reps = 0;
-
- if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( !sreg.attr.fields.p )
- return X86EMUL_UNHANDLEABLE;
- if ( !sreg.attr.fields.s ||
- (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
- !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
- {
- x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
- while ( *reps < goal )
- {
- unsigned int data = guest_io_read(port, bytes_per_rep, currd);
- unsigned long addr;
-
- rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
- sreg.limit, x86_seg_es, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
- {
- x86_emul_pagefault(PFEC_write_access,
- addr + bytes_per_rep - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- ++*reps;
-
- if ( poc->bpmatch || hypercall_preempt_check() )
- break;
-
- /* x86_emulate() clips the repetition count to ensure we don't wrap. */
- if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
- offset -= bytes_per_rep;
- else
- offset += bytes_per_rep;
- }
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
- uint16_t port,
- unsigned int bytes_per_rep, unsigned long *reps,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- struct vcpu *curr = current;
- struct domain *currd = current->domain;
- unsigned long goal = *reps;
- struct segment_register sreg;
- int rc;
-
- *reps = 0;
-
- if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
- return X86EMUL_UNHANDLEABLE;
-
- rc = priv_op_read_segment(seg, &sreg, ctxt);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( !sreg.attr.fields.p )
- return X86EMUL_UNHANDLEABLE;
- if ( !sreg.attr.fields.s ||
- ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
- !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
- {
- x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
- : TRAP_stack_error,
- 0, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
- while ( *reps < goal )
- {
- unsigned int data = 0;
- unsigned long addr;
-
- rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
- sreg.limit, seg, ctxt, &addr);
- if ( rc != X86EMUL_OKAY )
- return rc;
-
- if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
- {
- x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
- return X86EMUL_EXCEPTION;
- }
-
- guest_io_write(port, bytes_per_rep, data, currd);
-
- ++*reps;
-
- if ( poc->bpmatch || hypercall_preempt_check() )
- break;
-
- /* x86_emulate() clips the repetition count to ensure we don't wrap. */
- if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
- offset -= bytes_per_rep;
- else
- offset += bytes_per_rep;
- }
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_read_cr(unsigned int reg, unsigned long *val,
- struct x86_emulate_ctxt *ctxt)
-{
- const struct vcpu *curr = current;
-
- switch ( reg )
- {
- case 0: /* Read CR0 */
- *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
- return X86EMUL_OKAY;
-
- case 2: /* Read CR2 */
- case 4: /* Read CR4 */
- *val = curr->arch.pv_vcpu.ctrlreg[reg];
- return X86EMUL_OKAY;
-
- case 3: /* Read CR3 */
- {
- const struct domain *currd = curr->domain;
- unsigned long mfn;
-
- if ( !is_pv_32bit_domain(currd) )
- {
- mfn = pagetable_get_pfn(curr->arch.guest_table);
- *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
- }
- else
- {
- l4_pgentry_t *pl4e =
- map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
-
- mfn = l4e_get_pfn(*pl4e);
- unmap_domain_page(pl4e);
- *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
- }
- /* PTs should not be shared */
- BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
- return X86EMUL_OKAY;
- }
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_write_cr(unsigned int reg, unsigned long val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *curr = current;
-
- switch ( reg )
- {
- case 0: /* Write CR0 */
- if ( (val ^ read_cr0()) & ~X86_CR0_TS )
- {
- gdprintk(XENLOG_WARNING,
- "Attempt to change unmodifiable CR0 flags\n");
- break;
- }
- do_fpu_taskswitch(!!(val & X86_CR0_TS));
- return X86EMUL_OKAY;
-
- case 2: /* Write CR2 */
- curr->arch.pv_vcpu.ctrlreg[2] = val;
- arch_set_cr2(curr, val);
- return X86EMUL_OKAY;
-
- case 3: /* Write CR3 */
- {
- struct domain *currd = curr->domain;
- unsigned long gfn;
- struct page_info *page;
- int rc;
-
- gfn = !is_pv_32bit_domain(currd)
- ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
- page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
- if ( !page )
- break;
- rc = new_guest_cr3(page_to_mfn(page));
- put_page(page);
-
- switch ( rc )
- {
- case 0:
- return X86EMUL_OKAY;
- case -ERESTART: /* retry after preemption */
- return X86EMUL_RETRY;
- }
- break;
- }
-
- case 4: /* Write CR4 */
- curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
- write_cr4(pv_guest_cr4_to_real_cr4(curr));
- ctxt_switch_levelling(curr);
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_read_dr(unsigned int reg, unsigned long *val,
- struct x86_emulate_ctxt *ctxt)
-{
- unsigned long res = do_get_debugreg(reg);
-
- if ( IS_ERR_VALUE(res) )
- return X86EMUL_UNHANDLEABLE;
-
- *val = res;
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_write_dr(unsigned int reg, unsigned long val,
- struct x86_emulate_ctxt *ctxt)
-{
- return do_set_debugreg(reg, val) == 0
- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
-}
-
-static inline uint64_t guest_misc_enable(uint64_t val)
-{
- val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
- MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
- val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
- MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
- MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
- return val;
-}
-
-static inline bool is_cpufreq_controller(const struct domain *d)
-{
- return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
- is_hardware_domain(d));
-}
-
-static int priv_op_read_msr(unsigned int reg, uint64_t *val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
- const struct vcpu *curr = current;
- const struct domain *currd = curr->domain;
- bool vpmu_msr = false;
-
- switch ( reg )
- {
- int rc;
-
- case MSR_FS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
- return X86EMUL_OKAY;
-
- case MSR_GS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = cpu_has_fsgsbase ? __rdgsbase()
- : curr->arch.pv_vcpu.gs_base_kernel;
- return X86EMUL_OKAY;
-
- case MSR_SHADOW_GS_BASE:
- if ( is_pv_32bit_domain(currd) )
- break;
- *val = curr->arch.pv_vcpu.gs_base_user;
- return X86EMUL_OKAY;
-
- /*
- * In order to fully retain original behavior, defer calling
- * pv_soft_rdtsc() until after emulation. This may want/need to be
- * reconsidered.
- */
- case MSR_IA32_TSC:
- poc->tsc |= TSC_BASE;
- goto normal;
-
- case MSR_TSC_AUX:
- poc->tsc |= TSC_AUX;
- if ( cpu_has_rdtscp )
- goto normal;
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_EFER:
- *val = read_efer();
- if ( is_pv_32bit_domain(currd) )
- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
- return X86EMUL_OKAY;
-
- case MSR_K7_FID_VID_CTL:
- case MSR_K7_FID_VID_STATUS:
- case MSR_K8_PSTATE_LIMIT:
- case MSR_K8_PSTATE_CTRL:
- case MSR_K8_PSTATE_STATUS:
- case MSR_K8_PSTATE0:
- case MSR_K8_PSTATE1:
- case MSR_K8_PSTATE2:
- case MSR_K8_PSTATE3:
- case MSR_K8_PSTATE4:
- case MSR_K8_PSTATE5:
- case MSR_K8_PSTATE6:
- case MSR_K8_PSTATE7:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
- break;
- if ( unlikely(is_cpufreq_controller(currd)) )
- goto normal;
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_IA32_UCODE_REV:
- BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
- break;
- /* As documented in the SDM: Do a CPUID 1 here */
- cpuid_eax(1);
- }
- goto normal;
-
- case MSR_IA32_MISC_ENABLE:
- if ( rdmsr_safe(reg, *val) )
- break;
- *val = guest_misc_enable(*val);
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR0_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
- break;
- *val = curr->arch.pv_vcpu.dr_mask[0];
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
- break;
- *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
- return X86EMUL_OKAY;
-
- case MSR_IA32_PERF_CAPABILITIES:
- /* No extra capabilities are supported. */
- *val = 0;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_PLATFORM_INFO:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
- break;
- *val = 0;
- if ( this_cpu(cpuid_faulting_enabled) )
- *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_MISC_FEATURES_ENABLES:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
- break;
- *val = 0;
- if ( curr->arch.cpuid_faulting )
- *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
- return X86EMUL_OKAY;
-
- case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
- case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
- case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
- case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- vpmu_msr = true;
- /* fall through */
- case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
- case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
- if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
- {
- if ( vpmu_do_rdmsr(reg, val) )
- break;
- return X86EMUL_OKAY;
- }
- }
- /* fall through */
- default:
- if ( rdmsr_hypervisor_regs(reg, val) )
- return X86EMUL_OKAY;
-
- rc = vmce_rdmsr(reg, val);
- if ( rc < 0 )
- break;
- if ( rc )
- return X86EMUL_OKAY;
- /* fall through */
- normal:
- /* Everyone can read the MSR space. */
- /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
- if ( rdmsr_safe(reg, *val) )
- break;
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-#include "x86_64/mmconfig.h"
-
-static int priv_op_write_msr(unsigned int reg, uint64_t val,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *curr = current;
- const struct domain *currd = curr->domain;
- bool vpmu_msr = false;
-
- switch ( reg )
- {
- uint64_t temp;
- int rc;
-
- case MSR_FS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrfsbase(val);
- curr->arch.pv_vcpu.fs_base = val;
- return X86EMUL_OKAY;
-
- case MSR_GS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrgsbase(val);
- curr->arch.pv_vcpu.gs_base_kernel = val;
- return X86EMUL_OKAY;
-
- case MSR_SHADOW_GS_BASE:
- if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
- break;
- wrmsrl(MSR_SHADOW_GS_BASE, val);
- curr->arch.pv_vcpu.gs_base_user = val;
- return X86EMUL_OKAY;
-
- case MSR_K7_FID_VID_STATUS:
- case MSR_K7_FID_VID_CTL:
- case MSR_K8_PSTATE_LIMIT:
- case MSR_K8_PSTATE_CTRL:
- case MSR_K8_PSTATE_STATUS:
- case MSR_K8_PSTATE0:
- case MSR_K8_PSTATE1:
- case MSR_K8_PSTATE2:
- case MSR_K8_PSTATE3:
- case MSR_K8_PSTATE4:
- case MSR_K8_PSTATE5:
- case MSR_K8_PSTATE6:
- case MSR_K8_PSTATE7:
- case MSR_K8_HWCR:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_AMD64_NB_CFG:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
- boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
- ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
- goto invalid;
- if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_FAM10H_MMIO_CONF_BASE:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
- boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
- break;
- if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
- temp != val :
- ((temp ^ val) &
- ~(FAM10H_MMIO_CONF_ENABLE |
- (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
- FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
- ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
- FAM10H_MMIO_CONF_BASE_SHIFT))) )
- goto invalid;
- if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_UCODE_REV:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
- return X86EMUL_OKAY;
- if ( rdmsr_safe(reg, temp) )
- break;
- if ( val )
- goto invalid;
- return X86EMUL_OKAY;
-
- case MSR_IA32_MISC_ENABLE:
- if ( rdmsr_safe(reg, temp) )
- break;
- if ( val != guest_misc_enable(temp) )
- goto invalid;
- return X86EMUL_OKAY;
-
- case MSR_IA32_MPERF:
- case MSR_IA32_APERF:
- if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_PERF_CTL:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( likely(!is_cpufreq_controller(currd)) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_IA32_THERM_CONTROL:
- case MSR_IA32_ENERGY_PERF_BIAS:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
- break;
- if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
- wrmsr_safe(reg, val) == 0 )
- return X86EMUL_OKAY;
- break;
-
- case MSR_AMD64_DR0_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
- break;
- curr->arch.pv_vcpu.dr_mask[0] = val;
- if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
- wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
- return X86EMUL_OKAY;
-
- case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
- break;
- curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
- if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
- wrmsrl(reg, val);
- return X86EMUL_OKAY;
-
- case MSR_INTEL_PLATFORM_INFO:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
- break;
- return X86EMUL_OKAY;
-
- case MSR_INTEL_MISC_FEATURES_ENABLES:
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
- rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
- break;
- if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
- !this_cpu(cpuid_faulting_enabled) )
- break;
- curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
- return X86EMUL_OKAY;
-
- case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
- case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
- case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
- case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
- {
- vpmu_msr = true;
- case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
- case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
- if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
- {
- if ( (vpmu_mode & XENPMU_MODE_ALL) &&
- !is_hardware_domain(currd) )
- return X86EMUL_OKAY;
-
- if ( vpmu_do_wrmsr(reg, val, 0) )
- break;
- return X86EMUL_OKAY;
- }
- }
- /* fall through */
- default:
- if ( wrmsr_hypervisor_regs(reg, val) == 1 )
- return X86EMUL_OKAY;
-
- rc = vmce_wrmsr(reg, val);
- if ( rc < 0 )
- break;
- if ( rc )
- return X86EMUL_OKAY;
-
- if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
- invalid:
- gdprintk(XENLOG_WARNING,
- "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
- reg, temp, val);
- return X86EMUL_OKAY;
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
-{
- /* Ignore the instruction if unprivileged. */
- if ( !cache_flush_permitted(current->domain) )
- /*
- * Non-physdev domain attempted WBINVD; ignore for now since
- * newer linux uses this in some start-of-day timing loops.
- */
- ;
- else
- wbinvd();
-
- return X86EMUL_OKAY;
-}
-
-int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
- struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
-{
- guest_cpuid(current, leaf, subleaf, res);
-
- return X86EMUL_OKAY;
-}
-
-static int priv_op_validate(const struct x86_emulate_state *state,
- struct x86_emulate_ctxt *ctxt)
-{
- switch ( ctxt->opcode )
- {
- case 0x6c ... 0x6f: /* ins / outs */
- case 0xe4 ... 0xe7: /* in / out (immediate port) */
- case 0xec ... 0xef: /* in / out (port in %dx) */
- case X86EMUL_OPC(0x0f, 0x06): /* clts */
- case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
- case X86EMUL_OPC(0x0f, 0x20) ...
- X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
- case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
- case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
- case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
- case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
- return X86EMUL_OKAY;
-
- case 0xfa: case 0xfb: /* cli / sti */
- if ( !iopl_ok(current, ctxt->regs) )
- break;
- /*
- * This is just too dangerous to allow, in my opinion. Consider if the
- * caller then tries to reenable interrupts using POPF: we can't trap
- * that and we'll end up with hard-to-debug lockups. Fast & loose will
- * do for us. :-)
- vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
- */
- return X86EMUL_DONE;
-
- case X86EMUL_OPC(0x0f, 0x01):
- {
- unsigned int modrm_rm, modrm_reg;
-
- if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
- (modrm_rm & 7) != 1 )
- break;
- switch ( modrm_reg & 7 )
- {
- case 2: /* xsetbv */
- case 7: /* rdtscp */
- return X86EMUL_OKAY;
- }
- break;
- }
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static const struct x86_emulate_ops priv_op_ops = {
- .insn_fetch = priv_op_insn_fetch,
- .read = x86emul_unhandleable_rw,
- .validate = priv_op_validate,
- .read_io = priv_op_read_io,
- .write_io = priv_op_write_io,
- .rep_ins = priv_op_rep_ins,
- .rep_outs = priv_op_rep_outs,
- .read_segment = priv_op_read_segment,
- .read_cr = priv_op_read_cr,
- .write_cr = priv_op_write_cr,
- .read_dr = priv_op_read_dr,
- .write_dr = priv_op_write_dr,
- .read_msr = priv_op_read_msr,
- .write_msr = priv_op_write_msr,
- .cpuid = pv_emul_cpuid,
- .wbinvd = priv_op_wbinvd,
-};
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
-{
- struct vcpu *curr = current;
- struct domain *currd = curr->domain;
- struct priv_op_ctxt ctxt = {
- .ctxt.regs = regs,
- .ctxt.vendor = currd->arch.cpuid->x86_vendor,
- .ctxt.lma = !is_pv_32bit_domain(currd),
- };
- int rc;
- unsigned int eflags, ar;
-
- if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
- &ar, 1) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- !(ar & _SEGMENT_CODE) )
- return 0;
-
- /* Mirror virtualized state into EFLAGS. */
- ASSERT(regs->eflags & X86_EFLAGS_IF);
- if ( vcpu_info(curr, evtchn_upcall_mask) )
- regs->eflags &= ~X86_EFLAGS_IF;
- else
- regs->eflags |= X86_EFLAGS_IF;
- ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
- regs->eflags |= curr->arch.pv_vcpu.iopl;
- eflags = regs->eflags;
-
- ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
- /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
- rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
-
- if ( ctxt.io_emul_stub )
- unmap_domain_page(ctxt.io_emul_stub);
-
- /*
- * Un-mirror virtualized state from EFLAGS.
- * Nothing we allow to be emulated can change anything other than the
- * arithmetic bits, and the resume flag.
- */
- ASSERT(!((regs->eflags ^ eflags) &
- ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
- regs->eflags |= X86_EFLAGS_IF;
- regs->eflags &= ~X86_EFLAGS_IOPL;
-
- switch ( rc )
- {
- case X86EMUL_OKAY:
- if ( ctxt.tsc & TSC_BASE )
- {
- if ( ctxt.tsc & TSC_AUX )
- pv_soft_rdtsc(curr, regs, 1);
- else if ( currd->arch.vtsc )
- pv_soft_rdtsc(curr, regs, 0);
- else
- msr_split(regs, rdtsc());
- }
-
- if ( ctxt.ctxt.retire.singlestep )
- ctxt.bpmatch |= DR_STEP;
- if ( ctxt.bpmatch )
- {
- curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
- if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
- pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
- }
- /* fall through */
- case X86EMUL_RETRY:
- return EXCRET_fault_fixed;
-
- case X86EMUL_EXCEPTION:
- pv_inject_event(&ctxt.ctxt.event);
- return EXCRET_fault_fixed;
- }
-
- return 0;
-}
-
static inline int check_stack_limit(unsigned int ar, unsigned int limit,
unsigned int esp, unsigned int decr)
{
diff --git a/xen/include/asm-x86/pv/traps.h b/xen/include/asm-x86/pv/traps.h
new file mode 100644
index 0000000000..32c7bac587
--- /dev/null
+++ b/xen/include/asm-x86/pv/traps.h
@@ -0,0 +1,48 @@
+/*
+ * pv/traps.h
+ *
+ * PV guest traps interface definitions
+ *
+ * Copyright (C) 2017 Wei Liu <wei.liu2@citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __X86_PV_TRAPS_H__
+#define __X86_PV_TRAPS_H__
+
+#ifdef CONFIG_PV
+
+#include <public/xen.h>
+
+int emulate_privileged_op(struct cpu_user_regs *regs);
+
+#else /* !CONFIG_PV */
+
+#include <xen/errno.h>
+
+static inline int emulate_privileged_op(struct cpu_user_regs *regs) { return -EOPNOTSUPP; }
+
+#endif /* CONFIG_PV */
+
+#endif /* __X86_PV_TRAPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--
2.11.0
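
Some background sketches for constructs the code above leans on heavily; all
types, names, and values in them are simplified stand-ins, not the Xen
definitions.

priv_op_rep_outs() and priv_op_read_msr() recover their wrapping
priv_op_ctxt from the embedded x86_emulate_ctxt via container_of(). A
minimal, self-contained sketch of that pattern:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct emul_ctxt {            /* stand-in for x86_emulate_ctxt */
    int opcode;
};

struct priv_ctxt {            /* stand-in for priv_op_ctxt */
    struct emul_ctxt ctxt;    /* embedded generic context */
    unsigned int bpmatch;     /* extra per-emulation state */
};

static void hook(struct emul_ctxt *ctxt)
{
    /* Recover the outer structure from a pointer to the embedded member. */
    struct priv_ctxt *poc = container_of(ctxt, struct priv_ctxt, ctxt);

    poc->bpmatch |= 1;
}

int main(void)
{
    struct priv_ctxt poc = { .ctxt.opcode = 0x0f, .bpmatch = 0 };

    hook(&poc.ctxt);          /* only the embedded member is passed around */
    printf("bpmatch=%u\n", poc.bpmatch);
    return 0;
}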
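
The rep_ins/rep_outs loops step the string offset by bytes_per_rep in the
direction selected by EFLAGS.DF, relying on x86_emulate() having already
clipped the repetition count. A toy illustration of that stepping, with
made-up values:

#include <stdio.h>

int main(void)
{
    unsigned long offset = 0x1000;
    unsigned int bytes_per_rep = 4, reps;
    int df = 1;                 /* pretend EFLAGS.DF is set */

    for ( reps = 0; reps < 3; reps++ )
        /* DF set: walk the buffer downwards; DF clear: upwards. */
        offset += df ? -(long)bytes_per_rep : bytes_per_rep;

    printf("final offset %#lx\n", offset);   /* 0xff4 when DF is set */
    return 0;
}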
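
The priv_op_ops table wires per-operation hooks into the generic
x86_emulate() core; the validate hook vets the opcode before any other hook
runs. A compressed sketch of that dispatch shape, using invented names and a
trivial driver in place of x86_emulate():

#include <stdio.h>

enum rc { OKAY, UNHANDLEABLE };

struct ops {
    enum rc (*validate)(int opcode);
    enum rc (*read_cr)(unsigned int reg, unsigned long *val);
};

static enum rc my_validate(int opcode)
{
    switch ( opcode )
    {
    case 0x20: /* mov from cr */
        return OKAY;
    }
    return UNHANDLEABLE;     /* everything else is rejected up front */
}

static enum rc my_read_cr(unsigned int reg, unsigned long *val)
{
    if ( reg == 0 )
    {
        *val = 0x80050033UL; /* a plausible CR0 value, for illustration */
        return OKAY;
    }
    return UNHANDLEABLE;
}

static enum rc emulate(int opcode, const struct ops *ops)
{
    unsigned long val = 0;

    if ( ops->validate(opcode) != OKAY )
        return UNHANDLEABLE;
    if ( ops->read_cr(0, &val) != OKAY )
        return UNHANDLEABLE;
    printf("CR0 = %#lx\n", val);
    return OKAY;
}

int main(void)
{
    static const struct ops o = { my_validate, my_read_cr };

    return emulate(0x20, &o) == OKAY ? 0 : 1;
}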
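
The vendor-gated vPMU MSR cases place case labels inside an if block (a
Duff's-device-style construct) so the Intel-only ranges set vpmu_msr before
falling into the handling shared with the AMD ranges. A stripped-down sketch
of how control flows through such a switch, with toy values instead of the
real MSR constants:

#include <stdbool.h>
#include <stdio.h>

enum vendor { INTEL, AMD, OTHER };

static int classify(int msr, enum vendor v)
{
    bool vpmu_msr = false;

    switch ( msr )
    {
    case 1: case 2:                /* Intel-only vPMU range */
        if ( v == INTEL )
        {
            vpmu_msr = true;
            /* fall through */
    case 3: case 4:                /* AMD range: jumped to directly */
            if ( vpmu_msr || v == AMD )
                return 1;          /* the vPMU path handles it */
        }
        /* fall through */
    default:
        return 0;                  /* default MSR handling */
    }
}

int main(void)
{
    /* prints: 1 1 0 -- the AMD vendor skips the Intel-only block entirely */
    printf("%d %d %d\n", classify(1, INTEL), classify(3, AMD),
           classify(1, AMD));
    return 0;
}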
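
Finally, the !CONFIG_PV branch of the new header relies on the
static-inline-stub idiom: marking the stub static inline lets it live in a
header included by many translation units without producing duplicate
symbols, while callers compile unchanged and merely see the operation
reported as unsupported. A generic sketch of the idiom (CONFIG_FOO and
do_foo() are invented for illustration):

#include <errno.h>   /* EOPNOTSUPP */
#include <stdio.h>

/* #define CONFIG_FOO 1 -- define to pull in the real implementation */

#ifdef CONFIG_FOO
int do_foo(int x);                 /* real version lives in foo.c */
#else
static inline int do_foo(int x)    /* compiled-out stub; safe in a header */
{
    (void)x;
    return -EOPNOTSUPP;
}
#endif

int main(void)
{
    printf("do_foo: %d\n", do_foo(1));
    return 0;
}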