From mboxrd@z Thu Jan 1 00:00:00 1970 From: Mukesh Rathor Subject: [PATCH 13/20] PVH xen: introduce vmx_pvh.c Date: Tue, 14 May 2013 17:52:41 -0700 Message-ID: <1368579168-30829-14-git-send-email-mukesh.rathor@oracle.com> References: <1368579168-30829-1-git-send-email-mukesh.rathor@oracle.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1368579168-30829-1-git-send-email-mukesh.rathor@oracle.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Xen-devel@lists.xensource.com List-Id: xen-devel@lists.xenproject.org The heart of this patch is vmx exit handler for PVH guest. It is nicely isolated in a separate module as preferred by most of us. A call to it is added to vmx_pvh_vmexit_handler(). Changes in V2: - Move non VMX generic code to arch/x86/hvm/pvh.c - Remove get_gpr_ptr() and use existing decode_register() instead. - Defer call to pvh vmx exit handler until interrupts are enabled. So the caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now. - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set the correct feature bits in CR4 during vmcs creation. - Fix few hard tabs. Changes in V3: - Lot of cleanup and rework in PVH vm exit handler. - add parameter to emulate_forced_invalid_op(). Changes in V5: - Move pvh.c and emulate_forced_invalid_op related changes to another patch. - Formatting. - Remove vmx_pvh_read_descriptor(). - Use SS DPL instead of CS.RPL for CPL. - Remove pvh_user_cpuid() and call pv_cpuid for user mode also. Signed-off-by: Mukesh Rathor --- xen/arch/x86/hvm/vmx/Makefile | 1 + xen/arch/x86/hvm/vmx/vmx.c | 7 + xen/arch/x86/hvm/vmx/vmx_pvh.c | 502 +++++++++++++++++++++++++++++++++++++ xen/include/asm-x86/hvm/vmx/vmx.h | 2 + 4 files changed, 512 insertions(+), 0 deletions(-) create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile index 373b3d9..8b71dae 100644 --- a/xen/arch/x86/hvm/vmx/Makefile +++ b/xen/arch/x86/hvm/vmx/Makefile @@ -5,3 +5,4 @@ obj-y += vmcs.o obj-y += vmx.o obj-y += vpmu_core2.o obj-y += vvmx.o +obj-y += vmx_pvh.o diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index bd4c8bd..338272c 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -1595,6 +1595,7 @@ static struct hvm_function_table __initdata vmx_function_table = { .deliver_posted_intr = vmx_deliver_posted_intr, .sync_pir_to_irr = vmx_sync_pir_to_irr, .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m, + .pvh_set_vcpu_info = vmx_pvh_set_vcpu_info, }; const struct hvm_function_table * __init start_vmx(void) @@ -2438,6 +2439,12 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) ) return vmx_failed_vmentry(exit_reason, regs); + if ( is_pvh_vcpu(v) ) + { + vmx_pvh_vmexit_handler(regs); + return; + } + if ( v->arch.hvm_vmx.vmx_realmode ) { /* Put RFLAGS back the way the guest wants it */ diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c b/xen/arch/x86/hvm/vmx/vmx_pvh.c new file mode 100644 index 0000000..13a2813 --- /dev/null +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef NDEBUG +int pvhdbg = 0; +#define dbgp1(...) do { (pvhdbg == 1) ? printk(__VA_ARGS__) : 0; } while ( 0 ) +#else +#define dbgp1(...) ((void)0) +#endif + + +static void read_vmcs_selectors(struct cpu_user_regs *regs) +{ + regs->cs = __vmread(GUEST_CS_SELECTOR); + regs->ss = __vmread(GUEST_SS_SELECTOR); + regs->ds = __vmread(GUEST_DS_SELECTOR); + regs->es = __vmread(GUEST_ES_SELECTOR); + regs->gs = __vmread(GUEST_GS_SELECTOR); + regs->fs = __vmread(GUEST_FS_SELECTOR); +} + +/* returns : 0 == msr read successfully */ +static int vmxit_msr_read(struct cpu_user_regs *regs) +{ + u64 msr_content = 0; + + switch ( regs->ecx ) + { + case MSR_IA32_MISC_ENABLE: + rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); + msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + break; + + default: + /* pvh fixme: see hvm_msr_read_intercept() */ + rdmsrl(regs->ecx, msr_content); + break; + } + regs->eax = (uint32_t)msr_content; + regs->edx = (uint32_t)(msr_content >> 32); + vmx_update_guest_eip(); + + dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, regs->eax, + regs->edx, regs->rip, regs->rsp); + + return 0; +} + +/* returns : 0 == msr written successfully */ +static int vmxit_msr_write(struct cpu_user_regs *regs) +{ + uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32); + + dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx, + regs->eax, regs->edx); + + if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) + { + vmx_update_guest_eip(); + return 0; + } + return 1; +} + +static int vmxit_debug(struct cpu_user_regs *regs) +{ + struct vcpu *vp = current; + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION); + + write_debugreg(6, exit_qualification | 0xffff0ff0); + + /* gdbsx or another debugger */ + if ( vp->domain->domain_id != 0 && /* never pause dom0 */ + guest_kernel_mode(vp, regs) && vp->domain->debugger_attached ) + + domain_pause_for_debugger(); + else + hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE); + + return 0; +} + +/* Returns: rc == 0: handled the MTF vmexit */ +static int vmxit_mtf(struct cpu_user_regs *regs) +{ + struct vcpu *vp = current; + int rc = -EINVAL, ss = vp->arch.hvm_vcpu.single_step; + + vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control); + vp->arch.hvm_vcpu.single_step = 0; + + if ( vp->domain->debugger_attached && ss ) + { + domain_pause_for_debugger(); + rc = 0; + } + return rc; +} + +static int vmxit_int3(struct cpu_user_regs *regs) +{ + int ilen = vmx_get_instruction_length(); + struct vcpu *vp = current; + struct hvm_trap trap_info = { + .vector = TRAP_int3, + .type = X86_EVENTTYPE_SW_EXCEPTION, + .error_code = HVM_DELIVER_NO_ERROR_CODE, + .insn_len = ilen + }; + + /* gdbsx or another debugger. Never pause dom0 */ + if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) ) + { + regs->eip += ilen; + dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id()); + current->arch.gdbsx_vcpu_event = TRAP_int3; + domain_pause_for_debugger(); + return 0; + } + hvm_inject_trap(&trap_info); + + return 0; +} + +static int vmxit_invalid_op(struct cpu_user_regs *regs) +{ + if ( guest_kernel_mode(current, regs) || !emulate_forced_invalid_op(regs) ) + hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); + + return 0; +} + +/* Returns: rc == 0: handled the exception/NMI */ +static int vmxit_exception(struct cpu_user_regs *regs) +{ + unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK; + int rc = -ENOSYS; + + dbgp1(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, + __vmread(GUEST_CS_SELECTOR), regs->eip); + + switch ( vector ) + { + case TRAP_debug: + rc = vmxit_debug(regs); + break; + + case TRAP_int3: + rc = vmxit_int3(regs); + break; + + case TRAP_invalid_op: + rc = vmxit_invalid_op(regs); + break; + + case TRAP_no_device: + hvm_funcs.fpu_dirty_intercept(); + rc = 0; + break; + + default: + gdprintk(XENLOG_G_WARNING, + "PVH: Unhandled trap:%d. IP:%lx\n", vector, regs->eip); + } + return rc; +} + +static int vmxit_vmcall(struct cpu_user_regs *regs) +{ + if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted ) + vmx_update_guest_eip(); + return 0; +} + +/* Returns: rc == 0: success */ +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, uint64_t *regp) +{ + struct vcpu *vp = current; + + if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR ) + { + unsigned long new_cr0 = *regp; + unsigned long old_cr0 = __vmread(GUEST_CR0); + + dbgp1("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp); + if ( (u32)new_cr0 != new_cr0 ) + { + gdprintk(XENLOG_G_WARNING, + "Guest setting upper 32 bits in CR0: %lx", new_cr0); + return -EPERM; + } + + new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS; + /* ET is reserved and should be always be 1. */ + new_cr0 |= X86_CR0_ET; + + /* pvh not expected to change to real mode */ + if ( (new_cr0 & (X86_CR0_PE | X86_CR0_PG)) != + (X86_CR0_PG | X86_CR0_PE) ) + { + gdprintk(XENLOG_G_WARNING, + "PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0); + return -EPERM; + } + /* TS going from 1 to 0 */ + if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS) == 0) ) + vmx_fpu_enter(vp); + + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0; + __vmwrite(GUEST_CR0, new_cr0); + __vmwrite(CR0_READ_SHADOW, new_cr0); + } + else + { + *regp = __vmread(GUEST_CR0); + } + return 0; +} + +/* Returns: rc == 0: success */ +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, uint64_t *regp) +{ + if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR ) + { + u64 old_cr4 = __vmread(GUEST_CR4); + u64 new = *regp; + + if ( (old_cr4 ^ new) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) + vpid_sync_all(); + + __vmwrite(CR4_READ_SHADOW, new); + + new &= ~X86_CR4_PAE; /* PVH always runs with hap enabled */ + new |= X86_CR4_VMXE | X86_CR4_MCE; + __vmwrite(GUEST_CR4, new); + } + else + *regp = __vmread(CR4_READ_SHADOW); + + return 0; +} + +/* Returns: rc == 0: success, else -errno */ +static int vmxit_cr_access(struct cpu_user_regs *regs) +{ + unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION); + uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification); + int cr, rc = -EINVAL; + + switch ( acc_typ ) + { + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR: + case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR: + { + uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification); + uint64_t *regp = decode_register(gpr, regs, 0); + cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification); + + if ( regp == NULL ) + break; + + switch ( cr ) + { + case 0: + rc = access_cr0(regs, acc_typ, regp); + break; + + case 3: + gdprintk(XENLOG_G_ERR, "PVH: unexpected cr3 vmexit. rip:%lx\n", + regs->rip); + domain_crash_synchronous(); + break; + + case 4: + rc = access_cr4(regs, acc_typ, regp); + break; + } + if ( rc == 0 ) + vmx_update_guest_eip(); + break; + } + + case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: + { + struct vcpu *vp = current; + unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS; + vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0; + + vmx_fpu_enter(vp); + __vmwrite(GUEST_CR0, cr0); + __vmwrite(CR0_READ_SHADOW, cr0); + vmx_update_guest_eip(); + rc = 0; + } + } + return rc; +} + +/* + * NOTE: a PVH guest sets IOPL natively by setting bits in the eflags, and not + * by hypercalls used by a PV. + */ +static int vmxit_io_instr(struct cpu_user_regs *regs) +{ + struct segment_register seg; + int requested = (regs->rflags & X86_EFLAGS_IOPL) >> 12; + int curr_lvl = (regs->rflags & X86_EFLAGS_VM) ? 3 : 0; + + if ( curr_lvl == 0 ) + { + hvm_get_segment_register(current, x86_seg_ss, &seg); + curr_lvl = seg.attr.fields.dpl; + } + if ( requested >= curr_lvl && emulate_privileged_op(regs) ) + return 0; + + hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); + return 0; +} + +static int pvh_ept_handle_violation(unsigned long qualification, + paddr_t gpa, struct cpu_user_regs *regs) +{ + unsigned long gla, gfn = gpa >> PAGE_SHIFT; + p2m_type_t p2mt; + mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt); + + gdprintk(XENLOG_G_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), " + "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n", + qualification, + (qualification & EPT_READ_VIOLATION) ? 'r' : '-', + (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-', + (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-', + (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-', + (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-', + (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-', + gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp); + + ept_walk_table(current->domain, gfn); + + if ( qualification & EPT_GLA_VALID ) + { + gla = __vmread(GUEST_LINEAR_ADDRESS); + gdprintk(XENLOG_G_ERR, " --- GLA %#lx\n", gla); + } + hvm_inject_hw_exception(TRAP_gp_fault, 0); + return 0; +} + +/* + * Main vm exit handler for PVH . Called from vmx_vmexit_handler(). + * Note: vmx_asm_vmexit_handler updates rip/rsp/eflags in regs{} struct. + */ +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs) +{ + unsigned long exit_qualification; + unsigned int exit_reason = __vmread(VM_EXIT_REASON); + int rc=0, ccpu = smp_processor_id(); + struct vcpu *v = current; + + dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n", + ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, + __vmread(GUEST_CR0)); + + /* guest_kernel_mode() needs cs. read_segment_register needs others */ + read_vmcs_selectors(regs); + + switch ( (uint16_t)exit_reason ) + { + /* NMI and machine_check are handled by the caller, we handle rest here */ + case EXIT_REASON_EXCEPTION_NMI: /* 0 */ + rc = vmxit_exception(regs); + break; + + case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */ + break; /* handled in vmx_vmexit_handler() */ + + case EXIT_REASON_PENDING_VIRT_INTR: /* 7 */ + /* Disable the interrupt window. */ + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + break; + + case EXIT_REASON_CPUID: /* 10 */ + pv_cpuid(regs); + vmx_update_guest_eip(); + break; + + case EXIT_REASON_HLT: /* 12 */ + vmx_update_guest_eip(); + hvm_hlt(regs->eflags); + break; + + case EXIT_REASON_VMCALL: /* 18 */ + rc = vmxit_vmcall(regs); + break; + + case EXIT_REASON_CR_ACCESS: /* 28 */ + rc = vmxit_cr_access(regs); + break; + + case EXIT_REASON_DR_ACCESS: /* 29 */ + exit_qualification = __vmread(EXIT_QUALIFICATION); + vmx_dr_access(exit_qualification, regs); + break; + + case EXIT_REASON_IO_INSTRUCTION: /* 30 */ + vmxit_io_instr(regs); + break; + + case EXIT_REASON_MSR_READ: /* 31 */ + rc = vmxit_msr_read(regs); + break; + + case EXIT_REASON_MSR_WRITE: /* 32 */ + rc = vmxit_msr_write(regs); + break; + + case EXIT_REASON_MONITOR_TRAP_FLAG: /* 37 */ + rc = vmxit_mtf(regs); + break; + + case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */ + break; /* handled in vmx_vmexit_handler() */ + + case EXIT_REASON_EPT_VIOLATION: /* 48 */ + { + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS); + exit_qualification = __vmread(EXIT_QUALIFICATION); + rc = pvh_ept_handle_violation(exit_qualification, gpa, regs); + break; + } + + default: + rc = 1; + gdprintk(XENLOG_G_ERR, + "PVH: Unexpected exit reason:0x%x\n", exit_reason); + } + + if ( rc ) + { + exit_qualification = __vmread(EXIT_QUALIFICATION); + gdprintk(XENLOG_G_WARNING, + "PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx cr0:0x%016lx\n", + ccpu, exit_reason, exit_reason, exit_qualification, + exit_qualification, __vmread(GUEST_CR0)); + gdprintk(XENLOG_G_WARNING, "PVH: RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n", + regs->rip, regs->rsp, regs->rflags, __vmread(GUEST_CR3)); + domain_crash_synchronous(); + } +} + +/* + * Sets info for non boot SMP vcpu. VCPU 0 context is set by the library. + * In case of linux, the call comes from cpu_initialize_context(). + */ +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp) +{ + if ( v->vcpu_id == 0 ) + return 0; + + vmx_vmcs_enter(v); + __vmwrite(GUEST_GDTR_BASE, ctxtp->gdt.pvh.addr); + __vmwrite(GUEST_GDTR_LIMIT, ctxtp->gdt.pvh.limit); + __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user); + + __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs); + __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds); + __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es); + __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss); + __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs); + + if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) ) + { + vmx_vmcs_exit(v); + return -EINVAL; + } + vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel); + + vmx_vmcs_exit(v); + return 0; +} diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index 6fc0965..c9b5e23 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -471,6 +471,8 @@ void setup_ept_dump(void); void vmx_update_guest_eip(void); void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs *regs); +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs); +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp); int alloc_p2m_hap_data(struct p2m_domain *p2m); void free_p2m_hap_data(struct p2m_domain *p2m); -- 1.7.2.3