From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1753894AbcFPKE7 (ORCPT );
        Thu, 16 Jun 2016 06:04:59 -0400
Received: from mx1.redhat.com ([209.132.183.28]:38819 "EHLO mx1.redhat.com"
        rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
        id S1751323AbcFPKE5 (ORCPT );
        Thu, 16 Jun 2016 06:04:57 -0400
Subject: Re: [PATCH v2 3/3] KVM: VMX: enable guest access to LMCE related MSRs
To: Haozhong Zhang, kvm@vger.kernel.org
References: <20160616060531.30028-1-haozhong.zhang@intel.com>
 <20160616060531.30028-4-haozhong.zhang@intel.com>
Cc: rkrcmar@redhat.com, Thomas Gleixner, Ingo Molnar, "H . Peter Anvin",
        x86@kernel.org, linux-kernel@vger.kernel.org, Gleb Natapov,
        Boris Petkov, Tony Luck, Andi Kleen, Ashok Raj, Eduardo Habkost
From: Paolo Bonzini
Message-ID: <1785f481-033b-2780-37b8-d27f80276d33@redhat.com>
Date: Thu, 16 Jun 2016 12:04:50 +0200
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101
 Thunderbird/45.1.0
MIME-Version: 1.0
In-Reply-To: <20160616060531.30028-4-haozhong.zhang@intel.com>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16
        (mx1.redhat.com [10.5.110.38]); Thu, 16 Jun 2016 10:04:57 +0000 (UTC)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

On 16/06/2016 08:05, Haozhong Zhang wrote:
> From: Ashok Raj
> 
> On Intel platforms, this patch adds LMCE to KVM MCE supported
> capabilities and handles guest access to LMCE related MSRs.
> 
> Signed-off-by: Ashok Raj
> [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
>            Only enable LMCE on Intel platform
>            Check MSR_IA32_FEATURE_CONTROL when handling guest
>            access to MSR_IA32_MCG_EXT_CTL]
> Signed-off-by: Haozhong Zhang
> ---
>  arch/x86/include/asm/kvm_host.h |  5 +++++
>  arch/x86/kvm/vmx.c              | 36 +++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 15 +++++++++------
>  3 files changed, 49 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e0fbe7e..75defa6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -598,6 +598,7 @@ struct kvm_vcpu_arch {
>          u64 mcg_cap;
>          u64 mcg_status;
>          u64 mcg_ctl;
> +        u64 mcg_ext_ctl;
>          u64 *mce_banks;
>  
>          /* Cache MMIO info */
> @@ -1005,6 +1006,8 @@ struct kvm_x86_ops {
>          int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
>                                uint32_t guest_irq, bool set);
>          void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
> +
> +        void (*setup_mce)(struct kvm_vcpu *vcpu);
>  };
>  
>  struct kvm_arch_async_pf {
> @@ -1077,6 +1080,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits;
>  /* maximum allowed value of TSC scaling ratio */
>  extern u64 kvm_max_tsc_scaling_ratio;
>  
> +extern u64 kvm_mce_cap_supported;
> +
>  enum emulation_result {
>          EMULATE_DONE,         /* no further processing */
>          EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 1dc89c5..42db42e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
>   * feature_control_valid_bits_add/del(), so it's not included here.
>   */
>  #define FEATURE_CONTROL_MAX_VALID_BITS \
> -        FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX
> +        (FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE)
>  
>  static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits)
>  {
> @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>          return valid_bits && !(val & ~valid_bits);
>  }
>  
> +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu,
> +                                                bool host_initiated)
> +{
> +        return (vcpu->arch.mcg_cap & MCG_LMCE_P) &&

Checking MCG_LMCE_P is unnecessary, because you cannot set
FEATURE_CONTROL_LMCE unless MCG_LMCE_P is present.  You can just inline
this function in the callers, it's simpler.

> +               (host_initiated ||
> +                (to_vmx(vcpu)->msr_ia32_feature_control &
> +                 FEATURE_CONTROL_LMCE));
> +}
> +
>  /*
>   * Reads an msr value (of 'msr_index') into 'pdata'.
>   * Returns 0 on success, non-0 otherwise.
> @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                          return 1;
>                  msr_info->data = vmcs_read64(GUEST_BNDCFGS);
>                  break;
> +        case MSR_IA32_MCG_EXT_CTL:
> +                if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> +                                                  msr_info->host_initiated))
> +                        return 1;
> +                msr_info->data = vcpu->arch.mcg_ext_ctl;
> +                break;
>          case MSR_IA32_FEATURE_CONTROL:
>                  if (!vmx_feature_control_msr_valid(vcpu, 0))
>                          return 1;
> @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>          case MSR_IA32_TSC_ADJUST:
>                  ret = kvm_set_msr_common(vcpu, msr_info);
>                  break;
> +        case MSR_IA32_MCG_EXT_CTL:
> +                if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> +                                                  msr_info->host_initiated) ||
> +                    (data & ~MCG_EXT_CTL_LMCE_EN))
> +                        return 1;
> +                vcpu->arch.mcg_ext_ctl = data;
> +                break;
>          case MSR_IA32_FEATURE_CONTROL:
>                  if (!vmx_feature_control_msr_valid(vcpu, data) ||
>                      (to_vmx(vcpu)->msr_ia32_feature_control &
> @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
>  
>          kvm_set_posted_intr_wakeup_handler(wakeup_handler);
>  
> +        kvm_mce_cap_supported |= MCG_LMCE_P;

Ah, so virtual LMCE is available on all processors!  This is
interesting, but it also makes it more complicated to handle in QEMU;
a new QEMU generally doesn't require a new kernel.  Eduardo, any ideas?
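To make the QEMU side of the question a bit more concrete: detecting the
capability is the easy part, since KVM already exposes it through
KVM_X86_GET_MCE_CAP_SUPPORTED (touched by this very patch).  A minimal,
untested userspace sketch (purely illustrative, not actual QEMU code)
that probes the LMCE bit before deciding what to put into mcg_cap for
KVM_X86_SETUP_MCE:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Mirrors the kernel's MCG_LMCE_P (bit 27 of IA32_MCG_CAP). */
#define MCG_LMCE_P        (1ULL << 27)

int main(void)
{
        uint64_t mce_cap = 0;
        int kvm = open("/dev/kvm", O_RDWR);

        if (kvm < 0 || ioctl(kvm, KVM_X86_GET_MCE_CAP_SUPPORTED, &mce_cap) < 0) {
                perror("KVM_X86_GET_MCE_CAP_SUPPORTED");
                return 1;
        }

        /* Only advertise LMCE to the guest if the kernel reports it. */
        printf("LMCE %ssupported by KVM\n",
               (mce_cap & MCG_LMCE_P) ? "" : "not ");
        return 0;
}

The harder part is the policy, which is really what I'm asking about:
whether to enable it only when explicitly requested, so that the same
QEMU configuration keeps working on kernels that do not report the bit.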
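And to spell out the earlier comment about dropping the MCG_LMCE_P
check: inlining the condition in the two MSR handlers would look
roughly like this (untested sketch, just to show the shape) in
vmx_get_msr:

        case MSR_IA32_MCG_EXT_CTL:
                /* Readable from the host, or once the guest enabled LMCE. */
                if (!msr_info->host_initiated &&
                    !(to_vmx(vcpu)->msr_ia32_feature_control &
                      FEATURE_CONTROL_LMCE))
                        return 1;
                msr_info->data = vcpu->arch.mcg_ext_ctl;
                break;

and correspondingly in vmx_set_msr:

        case MSR_IA32_MCG_EXT_CTL:
                /* Same access rule, plus only the LMCE_EN bit may be set. */
                if ((!msr_info->host_initiated &&
                     !(to_vmx(vcpu)->msr_ia32_feature_control &
                       FEATURE_CONTROL_LMCE)) ||
                    (data & ~MCG_EXT_CTL_LMCE_EN))
                        return 1;
                vcpu->arch.mcg_ext_ctl = data;
                break;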
Thanks,

Paolo

>          return alloc_kvm_area();
>  
>  out8:
> @@ -10950,6 +10974,14 @@ out:
>          return ret;
>  }
>  
> +static void vmx_setup_mce(struct kvm_vcpu *vcpu)
> +{
> +        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
> +                feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE);
> +        else
> +                feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops = {
>          .cpu_has_kvm_support = cpu_has_kvm_support,
>          .disabled_by_bios = vmx_disabled_by_bios,
> @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
>          .pmu_ops = &intel_pmu_ops,
>  
>          .update_pi_irte = vmx_update_pi_irte,
> +
> +        .setup_mce = vmx_setup_mce,
>  };
>  
>  static int __init vmx_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index bf22721..5bf76ab 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -70,7 +70,8 @@
>  
>  #define MAX_IO_MSRS 256
>  #define KVM_MAX_MCE_BANKS 32
> -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
> +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
>  
>  #define emul_to_vcpu(ctxt) \
>          container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
> @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = {
>          MSR_IA32_MISC_ENABLE,
>          MSR_IA32_MCG_STATUS,
>          MSR_IA32_MCG_CTL,
> +        MSR_IA32_MCG_EXT_CTL,
>          MSR_IA32_SMBASE,
>  };
>  
> @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
>                  break;
>          }
>          case KVM_X86_GET_MCE_CAP_SUPPORTED: {
> -                u64 mce_cap;
> -
> -                mce_cap = KVM_MCE_CAP_SUPPORTED;
>                  r = -EFAULT;
> -                if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
> +                if (copy_to_user(argp, &kvm_mce_cap_supported,
> +                                 sizeof(kvm_mce_cap_supported)))
>                          goto out;
>                  r = 0;
>                  break;
> @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>          r = -EINVAL;
>          if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
>                  goto out;
> -        if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
> +        if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
>                  goto out;
>          r = 0;
>          vcpu->arch.mcg_cap = mcg_cap;
> @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>          /* Init IA32_MCi_CTL to all 1s */
>          for (bank = 0; bank < bank_num; bank++)
>                  vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> +
> +        if (kvm_x86_ops->setup_mce)
> +                kvm_x86_ops->setup_mce(vcpu);
>  out:
>          return r;
>  }
> 