From: Tim Deegan
Subject: Re: [PATCH 06/16] vmx: nest: handling VMX instruction exits
Date: Mon, 13 Sep 2010 12:10:01 +0100
Message-ID: <20100913111001.GD3844@whitby.uk.xensource.com>
References: <1283959344-3837-1-git-send-email-qing.he@intel.com>
 <1283959344-3837-7-git-send-email-qing.he@intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Disposition: inline
In-Reply-To: <1283959344-3837-7-git-send-email-qing.he@intel.com>
Sender: xen-devel-bounces@lists.xensource.com
Errors-To: xen-devel-bounces@lists.xensource.com
To: Qing He
Cc: Eddie, "xen-devel@lists.xensource.com", Dong
List-Id: xen-devel@lists.xenproject.org

At 16:22 +0100 on 08 Sep (1283962934), Qing He wrote:
> diff -r f1c1d3077337 xen/arch/x86/hvm/vmx/nest.c
> +/*
> + * VMX instructions support functions
> + */
> +
> +enum vmx_regs_enc {
> +    VMX_REG_RAX,
> +    VMX_REG_RCX,
> +    VMX_REG_RDX,
> +    VMX_REG_RBX,
> +    VMX_REG_RSP,
> +    VMX_REG_RBP,
> +    VMX_REG_RSI,
> +    VMX_REG_RDI,
> +#ifdef CONFIG_X86_64
> +    VMX_REG_R8,
> +    VMX_REG_R9,
> +    VMX_REG_R10,
> +    VMX_REG_R11,
> +    VMX_REG_R12,
> +    VMX_REG_R13,
> +    VMX_REG_R14,
> +    VMX_REG_R15,
> +#endif
> +};
> +
> +enum vmx_sregs_enc {
> +    VMX_SREG_ES,
> +    VMX_SREG_CS,
> +    VMX_SREG_SS,
> +    VMX_SREG_DS,
> +    VMX_SREG_FS,
> +    VMX_SREG_GS,
> +};
> +
> +enum x86_segment sreg_to_index[] = {
> +    [VMX_SREG_ES] = x86_seg_es,
> +    [VMX_SREG_CS] = x86_seg_cs,
> +    [VMX_SREG_SS] = x86_seg_ss,
> +    [VMX_SREG_DS] = x86_seg_ds,
> +    [VMX_SREG_FS] = x86_seg_fs,
> +    [VMX_SREG_GS] = x86_seg_gs,
> +};

Since you dislike adding new namespaces and translations, I'm sure you
can get rid of these. :)  It might even simplify some of the macros
below.
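
For the GPR side, the instruction-information encoding is just the
usual RAX=0 ... RDI=7, R8=8 ... R15=15 numbering that the enum above
spells out, so something along these lines (untested sketch, helper
name invented) would let you index straight into cpu_user_regs and
drop the CASE_*_REG macro families as well:

/* Untested sketch: index into cpu_user_regs using the architectural
 * GPR numbering carried in the VMX instruction-information field. */
static unsigned long *decode_gpr(struct cpu_user_regs *regs,
                                 unsigned int index)
{
    static const unsigned int offsets[] = {
        offsetof(struct cpu_user_regs, eax),
        offsetof(struct cpu_user_regs, ecx),
        offsetof(struct cpu_user_regs, edx),
        offsetof(struct cpu_user_regs, ebx),
        offsetof(struct cpu_user_regs, esp),
        offsetof(struct cpu_user_regs, ebp),
        offsetof(struct cpu_user_regs, esi),
        offsetof(struct cpu_user_regs, edi),
#ifdef CONFIG_X86_64
        offsetof(struct cpu_user_regs, r8),
        offsetof(struct cpu_user_regs, r9),
        offsetof(struct cpu_user_regs, r10),
        offsetof(struct cpu_user_regs, r11),
        offsetof(struct cpu_user_regs, r12),
        offsetof(struct cpu_user_regs, r13),
        offsetof(struct cpu_user_regs, r14),
        offsetof(struct cpu_user_regs, r15),
#endif
    };

    BUG_ON(index >= ARRAY_SIZE(offsets));
    return (unsigned long *)((char *)regs + offsets[index]);
}

reg_read()/reg_write() then collapse to a read or write through
*decode_gpr(regs, index).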

> +union vmx_inst_info {
> +    struct {
> +        unsigned int scaling           :2; /* bit 0-1 */
> +        unsigned int __rsvd0           :1; /* bit 2 */
> +        unsigned int reg1              :4; /* bit 3-6 */
> +        unsigned int addr_size         :3; /* bit 7-9 */
> +        unsigned int memreg            :1; /* bit 10 */
> +        unsigned int __rsvd1           :4; /* bit 11-14 */
> +        unsigned int segment           :3; /* bit 15-17 */
> +        unsigned int index_reg         :4; /* bit 18-21 */
> +        unsigned int index_reg_invalid :1; /* bit 22 */
> +        unsigned int base_reg          :4; /* bit 23-26 */
> +        unsigned int base_reg_invalid  :1; /* bit 27 */
> +        unsigned int reg2              :4; /* bit 28-31 */
> +    } fields;
> +    u32 word;
> +};
> +
> +struct vmx_inst_decoded {
> +#define VMX_INST_MEMREG_TYPE_MEMORY 0
> +#define VMX_INST_MEMREG_TYPE_REG    1
> +    int type;
> +    union {
> +        struct {
> +            unsigned long mem;
> +            unsigned int  len;
> +        };
> +        enum vmx_regs_enc reg1;
> +    };
> +
> +    enum vmx_regs_enc reg2;
> +};
> +
> +enum vmx_ops_result {
> +    VMSUCCEED,
> +    VMFAIL_VALID,
> +    VMFAIL_INVALID,
> +};
> +
> +#define CASE_SET_REG(REG, reg)      \
> +    case VMX_REG_ ## REG: regs->reg = value; break
> +#define CASE_GET_REG(REG, reg)      \
> +    case VMX_REG_ ## REG: value = regs->reg; break
> +
> +#define CASE_EXTEND_SET_REG         \
> +    CASE_EXTEND_REG(S)
> +#define CASE_EXTEND_GET_REG         \
> +    CASE_EXTEND_REG(G)
> +
> +#ifdef __i386__
> +#define CASE_EXTEND_REG(T)
> +#else
> +#define CASE_EXTEND_REG(T)          \
> +    CASE_ ## T ## ET_REG(R8, r8);   \
> +    CASE_ ## T ## ET_REG(R9, r9);   \
> +    CASE_ ## T ## ET_REG(R10, r10); \
> +    CASE_ ## T ## ET_REG(R11, r11); \
> +    CASE_ ## T ## ET_REG(R12, r12); \
> +    CASE_ ## T ## ET_REG(R13, r13); \
> +    CASE_ ## T ## ET_REG(R14, r14); \
> +    CASE_ ## T ## ET_REG(R15, r15)
> +#endif
> +
> +static unsigned long reg_read(struct cpu_user_regs *regs,
> +                              enum vmx_regs_enc index)
> +{
> +    unsigned long value = 0;
> +
> +    switch ( index ) {
> +    CASE_GET_REG(RAX, eax);
> +    CASE_GET_REG(RCX, ecx);
> +    CASE_GET_REG(RDX, edx);
> +    CASE_GET_REG(RBX, ebx);
> +    CASE_GET_REG(RBP, ebp);
> +    CASE_GET_REG(RSI, esi);
> +    CASE_GET_REG(RDI, edi);
> +    CASE_GET_REG(RSP, esp);
> +    CASE_EXTEND_GET_REG;
> +    default:
> +        break;
> +    }
> +
> +    return value;
> +}
> +
> +static void reg_write(struct cpu_user_regs *regs,
> +                      enum vmx_regs_enc index,
> +                      unsigned long value)
> +{
> +    switch ( index ) {
> +    CASE_SET_REG(RAX, eax);
> +    CASE_SET_REG(RCX, ecx);
> +    CASE_SET_REG(RDX, edx);
> +    CASE_SET_REG(RBX, ebx);
> +    CASE_SET_REG(RBP, ebp);
> +    CASE_SET_REG(RSI, esi);
> +    CASE_SET_REG(RDI, edi);
> +    CASE_SET_REG(RSP, esp);
> +    CASE_EXTEND_SET_REG;
> +    default:
> +        break;
> +    }
> +}
> +
> +static int decode_vmx_inst(struct cpu_user_regs *regs,
> +                           struct vmx_inst_decoded *decode)
> +{
> +    struct vcpu *v = current;
> +    union vmx_inst_info info;
> +    struct segment_register seg;
> +    unsigned long base, index, seg_base, disp, offset;
> +    int scale;
> +
> +    info.word = __vmread(VMX_INSTRUCTION_INFO);
> +
> +    if ( info.fields.memreg ) {
> +        decode->type = VMX_INST_MEMREG_TYPE_REG;
> +        decode->reg1 = info.fields.reg1;
> +    }
> +    else
> +    {
> +        decode->type = VMX_INST_MEMREG_TYPE_MEMORY;
> +        hvm_get_segment_register(v, sreg_to_index[info.fields.segment], &seg);
> +        /* TODO: segment type check */

Indeed. :)

> +        seg_base = seg.base;
> +
> +        base = info.fields.base_reg_invalid ? 0 :
> +            reg_read(regs, info.fields.base_reg);
> +
> +        index = info.fields.index_reg_invalid ? 0 :
> +            reg_read(regs, info.fields.index_reg);
> +
> +        scale = 1 << info.fields.scaling;
> +
> +        disp = __vmread(EXIT_QUALIFICATION);
> +
> +        offset = base + index * scale + disp;
> +        if ( offset > seg.limit )

DYM if ( offset + len > limit )?

Would it be worth trying to fold this into the existing x86_emulate
code, which already has careful memory access checks?

> +            goto gp_fault;
> +
> +        decode->mem = seg_base + base + index * scale + disp;
> +        decode->len = 1 << (info.fields.addr_size + 1);
> +    }
> +
> +    decode->reg2 = info.fields.reg2;
> +
> +    return X86EMUL_OKAY;
> +
> +gp_fault:
> +    hvm_inject_exception(TRAP_gp_fault, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +static int vmx_inst_check_privilege(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct segment_register cs;
> +
> +    hvm_get_segment_register(v, x86_seg_cs, &cs);
> +
> +    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
> +         !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) ||
> +         (regs->eflags & X86_EFLAGS_VM) ||
> +         (hvm_long_mode_enabled(v) && cs.attr.fields.l == 0) )
> +        goto invalid_op;
> +
> +    if ( (cs.sel & 3) > 0 )
> +        goto gp_fault;
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +
> +gp_fault:
> +    hvm_inject_exception(TRAP_gp_fault, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +static void vmreturn(struct cpu_user_regs *regs, enum vmx_ops_result res)
> +{
> +    unsigned long eflags = regs->eflags;
> +    unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
> +                         X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF;
> +
> +    eflags &= ~mask;
> +
> +    switch ( res ) {
> +    case VMSUCCEED:
> +        break;
> +    case VMFAIL_VALID:
> +        /* TODO: error number, useful for guest VMM debugging */
> +        eflags |= X86_EFLAGS_ZF;
> +        break;
> +    case VMFAIL_INVALID:
> +    default:
> +        eflags |= X86_EFLAGS_CF;
> +        break;
> +    }
> +
> +    regs->eflags = eflags;
> +}
> +
> +static int __clear_current_vvmcs(struct vmx_nest_struct *nest)
> +{
> +    int rc;
> +
> +    if ( nest->svmcs )
> +        __vmpclear(virt_to_maddr(nest->svmcs));
> +
> +#if !CONFIG_VVMCS_MAPPING
> +    rc = hvm_copy_to_guest_phys(nest->gvmcs_pa, nest->vvmcs, PAGE_SIZE);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +#endif
> +
> +    nest->vmcs_valid = 0;
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +/*
> + * VMX instructions handling
> + */
> +
> +int vmx_nest_handle_vmxon(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    struct vmx_inst_decoded decode;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( !is_nested_avail(v->domain) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;

I think you could fold these checks and the goto target into
decode_vmx_inst(), just to avoid repeating them in every
vmx_nest_handle_* function.
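
E.g. something like this (untested, and the helper name is made up) as
a common prologue; decode_vmx_inst() could then also do the
offset + len limit check I mentioned above once it knows len:

/* Untested sketch of a shared prologue for the vmx_nest_handle_*
 * handlers; "vmx_inst_prologue" is an invented name. */
static int vmx_inst_prologue(struct cpu_user_regs *regs,
                             struct vmx_inst_decoded *decode,
                             int need_vmxon)
{
    struct vcpu *v = current;
    int rc;

    if ( !is_nested_avail(v->domain) ||
         (need_vmxon && !v->arch.hvm_vmx.nest.guest_vmxon_pa) )
    {
        hvm_inject_exception(TRAP_invalid_op, 0, 0);
        return X86EMUL_EXCEPTION;
    }

    rc = vmx_inst_check_privilege(regs);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( decode == NULL )    /* e.g. VMXOFF has no operand to decode */
        return X86EMUL_OKAY;

    return decode_vmx_inst(regs, decode);
}

That would leave each handler with a single call and a single error
path instead of the repeated check/goto blocks.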

> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    nest->guest_vmxon_pa = gpa;
> +    nest->gvmcs_pa = 0;
> +    nest->vmcs_valid = 0;
> +#if !CONFIG_VVMCS_MAPPING
> +    nest->vvmcs = alloc_xenheap_page();
> +    if ( !nest->vvmcs )
> +    {
> +        gdprintk(XENLOG_ERR, "nest: allocation for virtual vmcs failed\n");
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +#endif
> +    nest->svmcs = alloc_xenheap_page();
> +    if ( !nest->svmcs )
> +    {
> +        gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n");
> +        free_xenheap_page(nest->vvmcs);
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +
> +    /*
> +     * `fork' the host vmcs to shadow_vmcs
> +     * vmcs_lock is not needed since we are on current
> +     */
> +    nest->hvmcs = v->arch.hvm_vmx.vmcs;
> +    __vmpclear(virt_to_maddr(nest->hvmcs));
> +    memcpy(nest->svmcs, nest->hvmcs, PAGE_SIZE);
> +    __vmptrld(virt_to_maddr(nest->hvmcs));
> +    v->arch.hvm_vmx.launched = 0;
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +out:
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmxoff(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    nest->guest_vmxon_pa = 0;
> +    __vmpclear(virt_to_maddr(nest->svmcs));
> +
> +#if !CONFIG_VVMCS_MAPPING
> +    free_xenheap_page(nest->vvmcs);
> +#endif
> +    free_xenheap_page(nest->svmcs);

These also need to be freed on domain teardown.
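
Something like the below (untested; the hook name and where it gets
called from are only suggestions) wired into the existing vcpu/domain
destruction path would cover it.  Note that with CONFIG_VVMCS_MAPPING
the vvmcs wants unmapping rather than freeing.

/* Untested sketch; the name is invented, call it from wherever vcpu
 * destruction already reaches the vmx code. */
void vmx_nest_vcpu_destroy(struct vcpu *v)
{
    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;

    if ( !nest->guest_vmxon_pa )
        return;

#if CONFIG_VVMCS_MAPPING
    if ( nest->vvmcs )
        unmap_domain_page_global(nest->vvmcs);
#else
    if ( nest->vvmcs )
        free_xenheap_page(nest->vvmcs);
#endif
    if ( nest->svmcs )
        free_xenheap_page(nest->svmcs);

    nest->guest_vmxon_pa = 0;
}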

> +    vmreturn(regs, VMSUCCEED);
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmptrld(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    if ( gpa == nest->guest_vmxon_pa || gpa & 0xfff )
> +    {
> +        vmreturn(regs, VMFAIL_INVALID);
> +        goto out;
> +    }
> +
> +    if ( nest->gvmcs_pa != gpa )
> +    {
> +        if ( nest->vmcs_valid )
> +        {
> +            rc = __clear_current_vvmcs(nest);
> +            if ( rc != X86EMUL_OKAY )
> +                return rc;
> +        }
> +        nest->gvmcs_pa = gpa;
> +        ASSERT(nest->vmcs_valid == 0);
> +    }
> +
> +
> +    if ( !nest->vmcs_valid )
> +    {
> +#if CONFIG_VVMCS_MAPPING
> +        unsigned long mfn;
> +        p2m_type_t p2mt;
> +
> +        mfn = mfn_x(gfn_to_mfn(p2m_get_hostp2m(v->domain),
> +                               nest->gvmcs_pa >> PAGE_SHIFT, &p2mt));
> +        nest->vvmcs = map_domain_page_global(mfn);
> +#else
> +        rc = hvm_copy_from_guest_phys(nest->vvmcs, nest->gvmcs_pa, PAGE_SIZE);
> +        if ( rc != HVMCOPY_okay )
> +            return X86EMUL_EXCEPTION;
> +#endif
> +        nest->vmcs_valid = 1;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +out:
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmptrst(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +
> +    gpa = nest->gvmcs_pa;
> +
> +    rc = hvm_copy_to_guest_virt(decode.mem, &gpa, decode.len, 0);
> +    if ( rc != HVMCOPY_okay )
> +        return X86EMUL_EXCEPTION;
> +
> +    vmreturn(regs, VMSUCCEED);
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_handle_vmclear(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    unsigned long gpa = 0;
> +    int rc;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    rc = vmx_inst_check_privilege(regs);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    rc = decode_vmx_inst(regs, &decode);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    ASSERT(decode.type == VMX_INST_MEMREG_TYPE_MEMORY);
> +    rc = hvm_copy_from_guest_virt(&gpa, decode.mem, decode.len, 0);

Is it guaranteed that decode.len is always <= sizeof gpa here, even
with a malicious guest?

Cheers,

Tim.

-- 
Tim Deegan
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)