From: Tim Deegan
Subject: Re: [PATCH 06/16] vmx: nest: handling VMX instruction exits
Date: Mon, 13 Sep 2010 12:10:01 +0100
Message-ID: <20100913111001.GD3844@whitby.uk.xensource.com>
References: <1283959344-3837-1-git-send-email-qing.he@intel.com>
 <1283959344-3837-7-git-send-email-qing.he@intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Disposition: inline
In-Reply-To: <1283959344-3837-7-git-send-email-qing.he@intel.com>
Sender: xen-devel-bounces@lists.xensource.com
Errors-To: xen-devel-bounces@lists.xensource.com
To: Qing He
Cc: Eddie, "xen-devel@lists.xensource.com", Dong
List-Id: xen-devel@lists.xenproject.org

At 16:22 +0100 on 08 Sep (1283962934), Qing He wrote:
> diff -r f1c1d3077337 xen/arch/x86/hvm/vmx/nest.c
> +/*
> + * VMX instructions support functions
> + */
> +
> +enum vmx_regs_enc {
> +    VMX_REG_RAX,
> +    VMX_REG_RCX,
> +    VMX_REG_RDX,
> +    VMX_REG_RBX,
> +    VMX_REG_RSP,
> +    VMX_REG_RBP,
> +    VMX_REG_RSI,
> +    VMX_REG_RDI,
> +#ifdef CONFIG_X86_64
> +    VMX_REG_R8,
> +    VMX_REG_R9,
> +    VMX_REG_R10,
> +    VMX_REG_R11,
> +    VMX_REG_R12,
> +    VMX_REG_R13,
> +    VMX_REG_R14,
> +    VMX_REG_R15,
> +#endif
> +};
> +
> +enum vmx_sregs_enc {
> +    VMX_SREG_ES,
> +    VMX_SREG_CS,
> +    VMX_SREG_SS,
> +    VMX_SREG_DS,
> +    VMX_SREG_FS,
> +    VMX_SREG_GS,
> +};
> +
> +enum x86_segment sreg_to_index[] = {
> +    [VMX_SREG_ES] = x86_seg_es,
> +    [VMX_SREG_CS] = x86_seg_cs,
> +    [VMX_SREG_SS] = x86_seg_ss,
> +    [VMX_SREG_DS] = x86_seg_ds,
> +    [VMX_SREG_FS] = x86_seg_fs,
> +    [VMX_SREG_GS] = x86_seg_gs,
> +};

Since you dislike adding new namespaces and translations, I'm sure you
can get rid of these. :)  It might even simplify some of the macros
below.
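
For the GPR side, the instruction-information encoding is just the
usual RAX=0 ... RDI=7, R8=8 ... R15=15 numbering that the enum above
spells out, so something along these lines (untested sketch, helper
name invented) would let you index straight into cpu_user_regs and
drop the CASE_*_REG macro families as well:

/* Untested sketch: index into cpu_user_regs using the architectural
 * GPR numbering carried in the VMX instruction-information field. */
static unsigned long *decode_gpr(struct cpu_user_regs *regs,
                                 unsigned int index)
{
    static const unsigned int offsets[] = {
        offsetof(struct cpu_user_regs, eax),
        offsetof(struct cpu_user_regs, ecx),
        offsetof(struct cpu_user_regs, edx),
        offsetof(struct cpu_user_regs, ebx),
        offsetof(struct cpu_user_regs, esp),
        offsetof(struct cpu_user_regs, ebp),
        offsetof(struct cpu_user_regs, esi),
        offsetof(struct cpu_user_regs, edi),
#ifdef CONFIG_X86_64
        offsetof(struct cpu_user_regs, r8),
        offsetof(struct cpu_user_regs, r9),
        offsetof(struct cpu_user_regs, r10),
        offsetof(struct cpu_user_regs, r11),
        offsetof(struct cpu_user_regs, r12),
        offsetof(struct cpu_user_regs, r13),
        offsetof(struct cpu_user_regs, r14),
        offsetof(struct cpu_user_regs, r15),
#endif
    };

    BUG_ON(index >= ARRAY_SIZE(offsets));
    return (unsigned long *)((char *)regs + offsets[index]);
}

reg_read()/reg_write() then collapse to a read or write through
*decode_gpr(regs, index).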

> +union vmx_inst_info {
> +    struct {
> +        unsigned int scaling           :2; /* bit 0-1 */
> +        unsigned int __rsvd0           :1; /* bit 2 */
> +        unsigned int reg1              :4; /* bit 3-6 */
> +        unsigned int addr_size         :3; /* bit 7-9 */
> +        unsigned int memreg            :1; /* bit 10 */
> +        unsigned int __rsvd1           :4; /* bit 11-14 */
> +        unsigned int segment           :3; /* bit 15-17 */
> +        unsigned int index_reg         :4; /* bit 18-21 */
> +        unsigned int index_reg_invalid :1; /* bit 22 */
> +        unsigned int base_reg          :4; /* bit 23-26 */
> +        unsigned int base_reg_invalid  :1; /* bit 27 */
> +        unsigned int reg2              :4; /* bit 28-31 */
> +    } fields;
> +    u32 word;
> +};
> +
> +struct vmx_inst_decoded {
> +#define VMX_INST_MEMREG_TYPE_MEMORY 0
> +#define VMX_INST_MEMREG_TYPE_REG    1
> +    int type;
> +    union {
> +        struct {
> +            unsigned long mem;
> +            unsigned int  len;
> +        };
> +        enum vmx_regs_enc reg1;
> +    };
> +
> +    enum vmx_regs_enc reg2;
> +};
> +
> +enum vmx_ops_result {
> +    VMSUCCEED,
> +    VMFAIL_VALID,
> +    VMFAIL_INVALID,
> +};
> +
> +#define CASE_SET_REG(REG, reg)      \
> +    case VMX_REG_ ## REG: regs->reg = value; break
> +#define CASE_GET_REG(REG, reg)      \
> +    case VMX_REG_ ## REG: value = regs->reg; break
> +
> +#define CASE_EXTEND_SET_REG         \
> +    CASE_EXTEND_REG(S)
> +#define CASE_EXTEND_GET_REG         \
> +    CASE_EXTEND_REG(G)
> +
> +#ifdef __i386__
> +#define CASE_EXTEND_REG(T)
> +#else
> +#define CASE_EXTEND_REG(T)          \
> +    CASE_ ## T ## ET_REG(R8, r8);   \
> +    CASE_ ## T ## ET_REG(R9, r9);   \
> +    CASE_ ## T ## ET_REG(R10, r10); \
> +    CASE_ ## T ## ET_REG(R11, r11); \
> +    CASE_ ## T ## ET_REG(R12, r12); \
> +    CASE_ ## T ## ET_REG(R13, r13); \
> +    CASE_ ## T ## ET_REG(R14, r14); \
> +    CASE_ ## T ## ET_REG(R15, r15)
> +#endif
> +
> +static unsigned long reg_read(struct cpu_user_regs *regs,
> +                              enum vmx_regs_enc index)
> +{
> +    unsigned long value = 0;
> +
> +    switch ( index ) {
> +    CASE_GET_REG(RAX, eax);
> +    CASE_GET_REG(RCX, ecx);
> +    CASE_GET_REG(RDX, edx);
> +    CASE_GET_REG(RBX, ebx);
> +    CASE_GET_REG(RBP, ebp);
> +    CASE_GET_REG(RSI, esi);
> +    CASE_GET_REG(RDI, edi);
> +    CASE_GET_REG(RSP, esp);
> +    CASE_EXTEND_GET_REG;
> +    default:
> +        break;
> +    }
> +
> +    return value;
> +}
> +
> +static void reg_write(struct cpu_user_regs *regs,
> +                      enum vmx_regs_enc index,
> +                      unsigned long value)
> +{
> +    switch ( index ) {
> +    CASE_SET_REG(RAX, eax);
> +    CASE_SET_REG(RCX, ecx);
> +    CASE_SET_REG(RDX, edx);
> +    CASE_SET_REG(RBX, ebx);
> +    CASE_SET_REG(RBP, ebp);
> +    CASE_SET_REG(RSI, esi);
> +    CASE_SET_REG(RDI, edi);
> +    CASE_SET_REG(RSP, esp);
> +    CASE_EXTEND_SET_REG;
> +    default:
> +        break;
> +    }
> +}
> +
> +static int decode_vmx_inst(struct cpu_user_regs *regs,
> +                           struct vmx_inst_decoded *decode)
> +{
> +    struct vcpu *v = current;
> +    union vmx_inst_info info;
> +    struct segment_register seg;
> +    unsigned long base, index, seg_base, disp, offset;
> +    int scale;
> +
> +    info.word = __vmread(VMX_INSTRUCTION_INFO);
> +
> +    if ( info.fields.memreg ) {
> +        decode->type = VMX_INST_MEMREG_TYPE_REG;
> +        decode->reg1 = info.fields.reg1;
> +    }
> +    else
> +    {
> +        decode->type = VMX_INST_MEMREG_TYPE_MEMORY;
> +        hvm_get_segment_register(v, sreg_to_index[info.fields.segment], &seg);
> +        /* TODO: segment type check */

Indeed. :)

> +        seg_base = seg.base;
> +
> +        base = info.fields.base_reg_invalid ? 0 :
> +            reg_read(regs, info.fields.base_reg);
> +
> +        index = info.fields.index_reg_invalid ? 0 :
> +            reg_read(regs, info.fields.index_reg);
> +
> +        scale = 1 << info.fields.scaling;
> +
> +        disp = __vmread(EXIT_QUALIFICATION);
> +
> +        offset = base + index * scale + disp;
> +        if ( offset > seg.limit )

DYM if ( offset + len > limit )?

Would it be worth trying to fold this into the existing x86_emulate
code, which already has careful memory access checks?

> +            goto gp_fault;
> +
> +        decode->mem = seg_base + base + index * scale + disp;
> +        decode->len = 1 << (info.fields.addr_size + 1);
> +    }
> +
> +    decode->reg2 = info.fields.reg2;
> +
> +    return X86EMUL_OKAY;
> +
> +gp_fault:
> +    hvm_inject_exception(TRAP_gp_fault, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +static int vmx_inst_check_privilege(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct segment_register cs;
> +
> +    hvm_get_segment_register(v, x86_seg_cs, &cs);
> +
> +    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
> +         !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) ||
> +         (regs->eflags & X86_EFLAGS_VM) ||
> +         (hvm_long_mode_enabled(v) && cs.attr.fields.l == 0) )
> +        goto invalid_op;
> +
> +    if ( (cs.sel & 3) > 0 )
> +        goto gp_fault;
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +
> +gp_fault:
> +    hvm_inject_exception(TRAP_gp_fault, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +static void vmreturn(struct cpu_user_regs *regs, enum vmx_ops_result res)
> +{
> +    unsigned long eflags = regs->eflags;
> +    unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
> +                         X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF;
> +
> +    eflags &= ~mask;
> +
> +    switch ( res ) {
> +    case VMSUCCEED:
> +        break;
> +    case VMFAIL_VALID:
> +        /* TODO: error number, useful for guest VMM debugging */
> +        eflags |= X86_EFLAGS_ZF;
> +        break;
> +    case VMFAIL_INVALID:
> +    default:
> +        eflags |= X86_EFLAGS_CF;
> +        break;
> +    }
> +
> +    regs->eflags = eflags;
> +}
> +
> +static int __clear_current_vvmcs(struct vmx_nest_struct *nest)
> +{
> +    int rc;
> +
> +    if ( nest->svmcs )
> +        __vmpclear(virt_to_maddr(nest->svmcs));
> +
> +#if !CONFIG_VVMCS_MAPPING
> +    rc = hvm_copy_to_guest_phys(nest->gvmcs_pa, nest->vvmcs, PAGE_SIZE);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +#endif
> +
> +    nest->vmcs_valid = 0;
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +/*
> + * VMX instructions handling
> + */
> +
> +int vmx_nest_handle_vmxon(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    struct vmx_inst_decoded decode;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( !is_nested_avail(v->domain) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;

I think you could fold these checks and the goto target into
decode_vmx_inst(), just to avoid repeating them in every
vmx_nest_handle_* function.
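
E.g. something like this (untested, and the helper name is made up) as
a common prologue; decode_vmx_inst() could then also do the
offset + len limit check I mentioned above once it knows len:

/* Untested sketch of a shared prologue for the vmx_nest_handle_*
 * handlers; "vmx_inst_prologue" is an invented name. */
static int vmx_inst_prologue(struct cpu_user_regs *regs,
                             struct vmx_inst_decoded *decode,
                             int need_vmxon)
{
    struct vcpu *v = current;
    int rc;

    if ( !is_nested_avail(v->domain) ||
         (need_vmxon && !v->arch.hvm_vmx.nest.guest_vmxon_pa) )
    {
        hvm_inject_exception(TRAP_invalid_op, 0, 0);
        return X86EMUL_EXCEPTION;
    }

    rc = vmx_inst_check_privilege(regs);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( decode == NULL )    /* e.g. VMXOFF has no operand to decode */
        return X86EMUL_OKAY;

    return decode_vmx_inst(regs, decode);
}

That would leave each handler with a single call and a single error
path instead of the repeated check/goto blocks.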

> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    nest->guest_vmxon_pa = gpa;
> +    nest->gvmcs_pa = 0;
> +    nest->vmcs_valid = 0;
> +#if !CONFIG_VVMCS_MAPPING
> +    nest->vvmcs = alloc_xenheap_page();
> +    if ( !nest->vvmcs )
> +    {
> +        gdprintk(XENLOG_ERR, "nest: allocation for virtual vmcs failed\n");
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +#endif
> +    nest->svmcs = alloc_xenheap_page();
> +    if ( !nest->svmcs )
> +    {
> +        gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n");
> +        free_xenheap_page(nest->vvmcs);
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +
> +    /*
> +     * `fork' the host vmcs to shadow_vmcs
> +     * vmcs_lock is not needed since we are on current
> +     */
> +    nest->hvmcs = v->arch.hvm_vmx.vmcs;
> +    __vmpclear(virt_to_maddr(nest->hvmcs));
> +    memcpy(nest->svmcs, nest->hvmcs, PAGE_SIZE);
> +    __vmptrld(virt_to_maddr(nest->hvmcs));
> +    v->arch.hvm_vmx.launched = 0;
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +out:
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmxoff(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    nest->guest_vmxon_pa = 0;
> +    __vmpclear(virt_to_maddr(nest->svmcs));
> +
> +#if !CONFIG_VVMCS_MAPPING
> +    free_xenheap_page(nest->vvmcs);
> +#endif
> +    free_xenheap_page(nest->svmcs);

These also need to be freed on domain teardown.
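
Something like the below (untested; the hook name and where it gets
called from are only suggestions) wired into the existing vcpu/domain
destruction path would cover it.  Note that with CONFIG_VVMCS_MAPPING
the vvmcs wants unmapping rather than freeing.

/* Untested sketch; the name is invented, call it from wherever vcpu
 * destruction already reaches the vmx code. */
void vmx_nest_vcpu_destroy(struct vcpu *v)
{
    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;

    if ( !nest->guest_vmxon_pa )
        return;

#if CONFIG_VVMCS_MAPPING
    if ( nest->vvmcs )
        unmap_domain_page_global(nest->vvmcs);
#else
    if ( nest->vvmcs )
        free_xenheap_page(nest->vvmcs);
#endif
    if ( nest->svmcs )
        free_xenheap_page(nest->svmcs);

    nest->guest_vmxon_pa = 0;
}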

> +    vmreturn(regs, VMSUCCEED);
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmptrld(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    if ( gpa == nest->guest_vmxon_pa || gpa & 0xfff )
> +    {
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +
> +    if ( nest->gvmcs_pa != gpa )
> +    {
> +        if ( nest->vmcs_valid )
> +        {
> +            rc = __clear_current_vvmcs(nest);
> +            if ( rc != X86EMUL_OKAY )
> +                return rc;
> +        }
> +        nest->gvmcs_pa = gpa;
> +        ASSERT(nest->vmcs_valid == 0);
> +    }
> +
> +
> +    if ( !nest->vmcs_valid )
> +    {
> +#if CONFIG_VVMCS_MAPPING
> +        unsigned long mfn;
> +        p2m_type_t p2mt;
> +
> +        mfn = mfn_x(gfn_to_mfn(p2m_get_hostp2m(v->domain),
> +                               nest->gvmcs_pa >> PAGE_SHIFT, &p2mt));
> +        nest->vvmcs = map_domain_page_global(mfn);
> +#else
> +        rc = hvm_copy_from_guest_phys(nest->vvmcs, nest->gvmcs_pa, PAGE_SIZE);
> +        if ( rc != HVMCOPY_okay )
> +            return X86EMUL_EXCEPTION;
> +#endif
> +        nest->vmcs_valid = 1;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +out:
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmptrst(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +
> +    gpa = nest->gvmcs_pa;
> +
> +    rc = hvm_copy_to_guest_virt(decode.mem, &gpa, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    vmreturn(regs, VMSUCCEED);
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmclear(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);

Is it guaranteed that decode.len is always <= sizeof gpa here, even
with a malicious guest?

Cheers,

Tim.

-- 
Tim Deegan
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)